1 Gene expression and network analysis

1.1 Load data

require(RCurl)
# expression data 
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2

# metadata
meta2 <- read.delim(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/key_metadata.txt"), header = T)

1.2 Create initial exploratory plots

1.2.1 Histogram of cohort’s age

hist(meta2$Age)

1.2.2 Create data frame to plot whole dataset

library(tidyverse)
exdata1 <- as_tibble(exdata)
exdata1 <- as_tibble(t(exdata1)) # transpose to put samples in rows, genes in columns
colnames(exdata1) <- rownames(exdata) # add genes names
exdata1$AIBL.Id <- as.integer(colnames(exdata)) # add 'AIBL.Id' column
exdata2 <- left_join(meta2, exdata1, by = "AIBL.Id") %>% 
  na.omit # add metadata and remove PET and apoe4 columns with 'NA'
rownames(exdata2) <- exdata2$AIBL.Id # to ensure the labels in the plots reflect the AIBL.Ids
dim(exdata2)
## [1]   196 22016
exdata2[, 1:10]
##      AIBL.Id Demographic.Sex PET apoe4      Age  2315554  2315633  2315674
## 4          4          Female POS     0 84.63518 6.480369 6.891905 6.565621
## 5          5            Male NEG     0 73.14168 6.314319 6.779152 6.349589
## 12        12            Male POS     1 78.88843 6.290770 6.567759 6.436518
## 14        14          Female POS     1 65.47023 5.948114 6.681805 6.175586
## 21        21          Female NEG     0 73.46475 6.037015 6.485205 6.281951
## 23        23          Female NEG     0 83.21971 6.186476 6.513650 6.219359
## 25        25          Female POS     0 79.72348 6.076614 6.657682 6.498325
## 26        26            Male POS     0 81.75770 6.328989 6.554273 6.471516
## 28        28            Male POS     1 79.87406 6.297244 6.795406 6.312110
## 29        29          Female NEG     0 80.87337 5.866402 6.558860 6.351652
## 31        31          Female NEG     0 83.25804 5.838968 6.623398 6.056068
## 33        33            Male POS     1 82.00137 6.248705 6.666009 6.349729
## 36        36            Male POS     0 88.10951 6.493032 7.002052 6.738490
## 39        39            Male POS     0 62.62834 6.167725 6.650965 6.341245
## 40        40          Female POS     1 82.72964 6.489249 6.505397 6.549479
## 42        42            Male POS     0 81.60986 6.471965 7.012865 6.925032
## 43        43          Female NEG     0 81.20739 6.268364 6.526505 6.445091
## 47        47          Female NEG     1 78.89938 6.464944 6.714745 6.362125
## 50        50          Female NEG     0 65.41273 6.406922 6.548416 6.622533
## 52        52          Female NEG     0 72.43806 6.323187 6.939356 6.567885
## 53        53            Male POS     1 79.26899 5.872199 6.444315 6.278965
## 55        55            Male POS     0 78.03696 6.576354 6.636825 6.511823
## 56        56            Male NEG     0 79.14853 6.501945 6.965795 6.661681
## 57        57            Male POS     1 77.78508 6.116833 6.320499 6.306403
## 59        59          Female POS     0 77.88638 6.382484 6.473893 6.468157
## 60        60          Female POS     0 88.07940 5.758869 6.401347 5.973544
## 61        61          Female NEG     0 79.66051 6.469268 6.596029 6.642933
## 62        62          Female POS     0 75.66872 5.887743 6.969227 6.277725
## 64        64          Female NEG     0 78.34634 6.586534 6.619251 6.520611
## 66        66          Female POS     1 69.35524 6.469714 6.977623 6.569762
## 67        67          Female NEG     1 76.04654 6.293906 6.624161 6.545438
## 68        68          Female NEG     0 74.45585 6.255258 6.709539 6.444970
## 72        72            Male POS     0 76.38330 5.723363 6.626473 6.133998
## 75        75            Male NEG     0 75.57290 5.992771 6.479666 6.180949
## 79        79            Male POS     1 73.92471 6.304891 6.642175 6.449253
## 80        80          Female NEG     0 72.26010 6.538413 6.757661 6.394387
## 86        86            Male NEG     0 73.59343 6.202429 6.891970 6.537608
## 88        88          Female NEG     0 69.75222 6.298653 6.688547 6.350240
## 89        89            Male NEG     0 68.81314 6.156755 6.845456 6.379305
## 90        90            Male NEG     0 79.77002 6.368520 6.562456 6.455850
## 91        91          Female POS     0 61.09514 6.244896 6.715811 6.325632
## 97        97            Male NEG     0 63.32649 6.406573 6.689737 6.611027
## 99        99            Male NEG     0 84.08761 6.204438 6.601850 6.847750
## 100      100            Male POS     1 77.67283 6.249671 6.784119 6.479181
## 101      101            Male POS     1 78.60917 6.217539 6.723117 6.340995
## 102      102          Female POS     1 67.27173 6.153372 6.777107 6.202899
## 103      103            Male POS     1 73.98220 6.252399 6.572031 6.304950
## 104      104          Female NEG     0 80.33128 6.249090 6.692391 6.464357
## 110      110            Male NEG     0 85.93292 6.124358 6.425423 6.278595
## 111      111            Male POS     1 83.79466 6.790819 7.086384 7.004872
## 112      112            Male POS     1 82.55168 6.104592 6.724418 6.381245
## 113      113          Female NEG     0 77.97125 6.233691 6.623124 6.583993
## 117      117            Male POS     0 80.30938 6.283379 6.765484 6.495661
## 126      126          Female NEG     0 67.69610 6.088120 6.619230 6.457667
## 138      138          Female NEG     0 60.96920 6.462123 6.682734 6.714758
## 152      152            Male NEG     0 69.33607 6.216405 6.748362 6.286094
## 259      259          Female POS     1 79.85216 6.174100 6.739155 6.429327
## 270      270          Female NEG     1 65.14168 6.465453 6.511014 6.613107
## 273      273            Male NEG     1 64.89802 6.886670 7.265324 6.773535
## 276      276            Male NEG     0 65.25120 6.202502 6.508606 6.590262
## 277      277            Male POS     1 75.00342 6.074626 6.704253 6.368754
## 278      278          Female POS     1 67.83847 6.285486 6.725421 6.338260
## 280      280          Female NEG     0 61.01027 6.142124 6.819817 6.228511
## 287      287            Male NEG     1 64.61054 6.148718 6.611755 6.329001
## 289      289          Female NEG     0 60.61328 6.303928 6.605411 6.467153
## 294      294            Male NEG     1 62.28884 6.310482 6.757581 6.524905
## 301      301          Female NEG     0 66.70773 6.250539 6.829466 6.457082
## 310      310            Male POS     1 64.13415 6.573792 7.037530 6.636816
## 315      315          Female NEG     0 67.80835 6.024107 6.884582 6.560838
## 317      317          Female NEG     0 65.22656 6.346622 6.880260 6.345604
## 331      331            Male POS     0 72.56400 6.091366 6.539202 6.147746
## 342      342            Male NEG     0 67.57837 6.459433 6.796057 6.526122
## 349      349          Female NEG     0 72.99932 6.539442 6.664520 6.724221
## 354      354          Female POS     1 83.58658 6.379879 6.828928 6.583614
## 362      362          Female NEG     0 62.76249 6.489151 6.707376 6.504605
## 365      365          Female POS     1 80.52019 6.225867 6.579102 6.291010
## 366      366            Male POS     0 77.81520 6.610969 6.654806 6.727655
## 367      367          Female NEG     1 68.91718 6.159727 6.681467 6.240086
## 371      371          Female NEG     1 66.98426 6.265874 6.729486 6.440461
## 372      372          Female POS     0 65.78508 6.302200 6.607452 6.383913
## 373      373          Female POS     1 72.54757 6.135674 6.731264 6.309723
## 374      374          Female POS     0 64.54757 6.073725 6.798658 6.481094
## 376      376          Female POS     1 76.49008 6.616582 7.186329 6.609141
## 377      377            Male POS     0 73.93018 6.023341 6.755030 6.431724
## 378      378          Female POS     0 84.12320 6.260883 6.564279 6.617119
## 380      380            Male POS     1 69.98494 6.473748 7.031872 6.634808
## 388      388            Male POS     1 66.94593 6.172192 6.542353 6.315893
## 390      390            Male NEG     1 65.20192 6.049106 6.756951 6.334876
## 394      394          Female POS     1 74.28884 6.176199 6.742148 6.347111
## 403      403            Male NEG     1 61.62355 5.885767 6.643271 6.417598
## 406      406            Male POS     1 67.80014 6.251596 6.565337 6.412665
## 407      407          Female NEG     1 62.05065 6.387949 6.526086 6.390812
## 411      411            Male NEG     1 68.22177 6.162964 6.632073 6.394948
## 471      471          Female NEG     1 69.76318 6.247959 6.553351 6.534542
## 473      473          Female POS     1 81.44285 6.248213 6.654716 6.404635
## 477      477            Male NEG     1 81.36345 6.466705 6.732609 6.596663
## 480      480          Female NEG     1 71.78097 6.157917 6.913308 6.315593
## 482      482            Male NEG     1 75.69610 6.108606 6.564574 6.381925
## 483      483          Female POS     0 63.86585 6.335418 6.907938 6.557670
## 488      488          Female POS     1 79.45243 5.981184 6.514025 6.347060
## 498      498          Female NEG     0 61.46749 6.661186 7.028357 6.705888
## 507      507            Male POS     1 76.62697 6.461849 6.400309 6.723188
## 509      509          Female NEG     0 72.96646 6.043422 6.413871 6.200552
## 510      510            Male POS     0 61.05133 6.103153 6.522613 6.080440
## 516      516            Male NEG     0 69.31691 6.452975 6.936307 6.701086
## 518      518            Male POS     1 67.14853 6.081921 6.693521 6.384823
## 528      528            Male NEG     0 67.82204 6.167301 6.604118 6.381816
## 530      530            Male NEG     0 60.82409 6.530960 6.700354 6.722223
## 550      550          Female NEG     1 71.25804 6.444818 6.926947 6.458020
## 556      556            Male NEG     0 71.60301 6.279505 6.615724 6.410287
## 557      557          Female NEG     1 68.27105 5.928358 6.802994 6.246338
## 566      566            Male NEG     0 64.10678 6.190377 6.676997 6.452718
## 567      567          Female POS     0 63.02259 6.312758 6.829371 6.695018
## 570      570            Male POS     0 77.84257 6.288248 6.505913 6.344869
## 573      573            Male NEG     0 69.02943 6.288157 6.912437 6.307653
## 582      582            Male POS     1 84.77481 6.036305 6.688572 6.164308
## 588      588            Male POS     1 62.78439 5.925922 6.585664 6.202425
## 590      590          Female POS     1 91.78919 5.927381 6.743559 6.333311
## 609      609          Female POS     0 70.54894 6.314319 6.715392 6.283178
## 615      615            Male NEG     0 77.22382 6.113175 6.582526 6.363531
## 657      657          Female POS     1 80.16975 6.131198 6.639902 6.566688
## 658      658          Female POS     0 62.09446 6.223612 6.789396 6.432003
## 661      661            Male NEG     0 69.35250 6.144306 6.567274 6.564063
## 666      666            Male NEG     0 66.18754 6.197882 6.643357 6.285649
## 676      676          Female POS     1 77.86448 6.389598 6.701799 6.555583
## 696      696          Female NEG     1 61.30322 6.111071 6.584321 6.204916
## 697      697            Male NEG     0 64.21903 6.051107 6.409244 6.329083
## 698      698          Female NEG     0 85.55510 5.885922 6.435411 6.400886
## 699      699            Male NEG     1 72.21629 6.299913 6.326878 6.504349
## 702      702            Male POS     0 82.55441 5.861619 6.613266 6.139776
## 707      707            Male NEG     0 71.25257 6.862296 7.065670 7.127398
## 716      716            Male NEG     0 68.97741 5.849523 6.586282 6.054108
## 722      722            Male NEG     0 74.89938 6.327505 6.636835 6.403534
## 736      736          Female NEG     0 71.57563 6.140351 6.557702 6.321128
## 737      737            Male NEG     0 69.07324 5.917668 6.918993 6.152954
## 757      757            Male POS     1 70.56263 5.966071 6.561388 6.119326
## 771      771          Female NEG     0 60.16427 7.373542 7.229125 7.534851
## 772      772          Female POS     0 70.32991 6.429591 6.741193 6.671919
## 784      784          Female NEG     0 72.91718 6.251406 6.624422 6.281317
## 795      795          Female POS     0 76.85421 6.502011 6.825250 6.471669
## 796      796          Female NEG     0 74.85558 6.456219 6.838002 6.571417
## 798      798          Female NEG     0 68.69268 6.445431 6.814739 6.404046
## 808      808            Male POS     1 73.20465 6.336130 6.488948 6.350209
## 814      814            Male NEG     0 82.12457 6.032602 6.831285 6.317968
## 815      815          Female POS     1 69.62902 6.594248 6.870032 6.695476
## 819      819          Female POS     1 71.47159 6.395337 6.492036 6.575456
## 838      838            Male POS     0 82.23135 6.454275 6.981217 6.755547
## 844      844          Female NEG     0 62.23956 6.470576 6.757153 6.512149
## 851      851            Male POS     1 84.49555 6.500313 6.952976 6.675960
## 877      877            Male POS     1 75.84668 6.162636 6.782607 6.369384
## 890      890          Female POS     1 76.61875 6.479070 6.563948 6.594955
## 891      891          Female NEG     0 63.95072 5.944790 6.690265 6.218748
## 895      895            Male POS     0 73.12252 6.244164 6.816867 6.378894
## 904      904          Female NEG     1 63.04723 6.260028 6.596380 6.234968
## 906      906            Male NEG     0 75.63860 6.600079 7.066462 6.583025
## 914      914            Male NEG     0 71.90418 7.033261 7.021717 6.947102
## 917      917            Male POS     1 75.71800 6.413357 6.729174 6.739102
## 918      918          Female POS     1 58.83094 6.936336 7.146546 7.222852
## 930      930            Male POS     1 71.60849 6.253725 6.712326 6.519016
## 931      931            Male NEG     0 86.10541 6.595277 6.712869 6.622484
## 938      938          Female NEG     0 60.72005 6.010315 6.689541 6.218816
## 939      939          Female POS     1 76.46270 6.445422 6.603950 6.860858
## 942      942          Female POS     1 71.88227 6.078821 6.600787 6.319793
## 945      945          Female POS     1 72.54757 5.896341 6.987305 6.120857
## 952      952          Female POS     1 85.07050 6.440366 6.812504 6.577425
## 959      959            Male POS     1 76.41342 6.623003 6.619489 6.638507
## 965      965          Female POS     0 78.98973 6.639037 6.583663 6.843239
## 970      970            Male POS     1 78.16290 6.371120 6.836754 6.519902
## 971      971          Female POS     1 84.10130 6.113747 6.612382 6.345419
## 990      990          Female POS     0 79.70431 5.968143 6.700227 6.380947
## 993      993            Male POS     0 61.79603 6.172072 6.690875 6.351244
## 1000    1000          Female NEG     1 66.87201 6.288050 6.794610 6.584681
## 1001    1001            Male NEG     0 74.12457 6.806164 6.932284 6.621855
## 1010    1010            Male NEG     0 60.22177 6.121814 6.690690 6.419014
## 1013    1013          Female POS     1 72.71458 6.246259 6.374263 6.338173
## 1032    1032            Male POS     1 75.08556 6.334371 6.562914 6.374871
## 1050    1050            Male POS     1 64.77207 5.741934 6.678802 6.012837
## 1066    1066          Female POS     1 59.05818 6.187258 6.695226 6.483559
## 1067    1067          Female NEG     0 81.62355 5.985368 6.794342 6.059024
## 1078    1078          Female POS     1 83.41958 6.028581 6.466813 6.465285
## 1089    1089          Female POS     1 59.61944 6.214467 6.816567 6.706962
## 1091    1091          Female POS     0 70.47502 6.193594 6.131500 6.385019
## 1094    1094            Male POS     1 68.89528 6.025145 6.694290 6.502906
## 1096    1096            Male POS     1 71.56742 5.949029 6.581726 6.153074
## 1100    1100            Male POS     1 69.91376 6.233982 6.698891 6.205360
## 1102    1102            Male POS     1 71.41684 5.893660 6.551811 6.189609
## 1104    1104            Male NEG     0 72.42163 6.124993 6.534687 6.409842
## 1109    1109            Male POS     0 78.26694 6.145877 6.601277 6.405181
## 1122    1122            Male POS     1 60.35866 5.676284 6.788442 6.196706
## 1125    1125          Female NEG     0 85.11978 6.204285 6.678693 6.388874
## 1135    1135          Female POS     1 79.79466 5.934548 6.552959 6.248266
## 1139    1139            Male POS     1 55.14853 6.016007 6.934537 6.251933
## 1147    1147            Male NEG     0 60.76934 6.108658 6.714836 6.346043
## 1153    1153          Female NEG     0 75.75633 6.305337 6.887700 6.502986
## 1157    1157            Male NEG     1 67.60301 6.018838 6.725353 6.255278
## 1174    1174            Male POS     0 83.76728 6.162833 6.799695 6.459972
##       2315739  2315894
## 4    6.516567 8.176795
## 5    6.252506 8.635744
## 12   6.384537 7.963935
## 14   6.301732 8.043594
## 21   6.129048 7.945633
## 23   6.133969 7.968608
## 25   6.367791 8.240272
## 26   6.460154 8.756544
## 28   6.379973 8.661552
## 29   6.172999 8.285472
## 31   6.235320 8.119470
## 33   6.215513 8.060683
## 36   6.578067 8.828091
## 39   6.318249 8.426085
## 40   6.488439 8.371069
## 42   6.666519 8.786018
## 43   6.197191 8.188000
## 47   6.445460 8.476699
## 50   6.402781 8.389101
## 52   6.548507 8.399823
## 53   6.191993 8.172368
## 55   6.373156 8.389860
## 56   6.542421 8.621589
## 57   6.286881 8.231618
## 59   6.396478 8.412143
## 60   6.042320 8.042496
## 61   6.383923 7.818437
## 62   6.248329 8.142817
## 64   6.475421 8.077885
## 66   6.460657 8.267160
## 67   6.388726 8.091714
## 68   6.591385 8.389016
## 72   6.089050 7.638819
## 75   6.245394 8.029725
## 79   6.652024 8.249711
## 80   6.474735 8.007391
## 86   6.402931 8.297299
## 88   6.503762 8.521326
## 89   6.401387 8.665788
## 90   6.191511 8.089617
## 91   6.426800 8.399407
## 97   6.720172 8.717953
## 99   6.581944 8.220909
## 100  6.275917 7.936677
## 101  6.360023 8.455458
## 102  6.387004 8.135961
## 103  6.189617 8.048258
## 104  6.667132 8.060595
## 110  6.160518 8.067879
## 111  6.966086 8.967732
## 112  6.385202 8.012695
## 113  6.463539 8.460742
## 117  6.424220 8.395555
## 126  6.399296 8.219280
## 138  6.595803 9.324923
## 152  6.433069 8.130344
## 259  6.311515 8.487506
## 270  6.485162 7.571880
## 273  6.632758 8.556086
## 276  6.411020 7.488878
## 277  6.286240 8.469508
## 278  6.145291 8.382221
## 280  6.137446 8.230918
## 287  6.449422 8.505350
## 289  6.443845 8.325384
## 294  6.487336 7.854833
## 301  6.402543 8.157133
## 310  6.381600 7.875573
## 315  6.412068 8.418653
## 317  6.468000 8.143446
## 331  6.329408 8.128533
## 342  6.514058 8.658526
## 349  6.767479 8.098890
## 354  6.646620 7.959876
## 362  6.273409 8.596950
## 365  6.249836 7.895243
## 366  6.688523 8.673346
## 367  6.550660 8.093363
## 371  6.227942 7.830658
## 372  6.433531 8.095706
## 373  6.357371 8.256031
## 374  6.398171 8.329469
## 376  6.806113 8.325484
## 377  6.619245 8.362890
## 378  6.077606 8.060377
## 380  6.437845 8.313434
## 388  6.127493 8.013600
## 390  6.445356 8.181027
## 394  6.451033 8.369798
## 403  6.241045 7.688826
## 406  6.464875 8.119255
## 407  6.427262 8.448072
## 411  6.307937 8.530749
## 471  6.534982 8.574288
## 473  6.391032 8.577297
## 477  6.531321 8.138791
## 480  6.361480 8.353346
## 482  6.062568 8.005850
## 483  6.391227 8.274258
## 488  6.268688 8.123853
## 498  6.560862 8.571118
## 507  6.617919 8.555241
## 509  6.238213 7.816772
## 510  6.092995 8.371996
## 516  6.620882 8.796185
## 518  6.232935 8.199298
## 528  6.381270 8.076391
## 530  6.600723 8.458335
## 550  6.564602 8.257652
## 556  6.560507 8.324118
## 557  6.294528 8.458825
## 566  6.139174 8.258713
## 567  6.631280 8.579966
## 570  6.298944 8.309118
## 573  6.363114 8.533878
## 582  6.484227 8.350568
## 588  6.167237 8.102750
## 590  6.105932 8.145313
## 609  6.358180 8.711746
## 615  6.347374 7.808396
## 657  6.630271 8.411060
## 658  6.295594 8.668870
## 661  6.427691 8.158172
## 666  6.204427 7.766536
## 676  6.761706 8.707959
## 696  6.237833 8.388425
## 697  6.091262 8.242050
## 698  6.206039 8.296143
## 699  6.231725 7.766098
## 702  6.187461 8.181405
## 707  6.797781 8.444630
## 716  6.266113 8.082044
## 722  6.650641 8.631772
## 736  6.373879 8.317890
## 737  6.386691 8.162499
## 757  6.148482 8.317301
## 771  6.949544 9.848143
## 772  6.555872 8.299831
## 784  6.453931 8.557491
## 795  6.580387 8.465663
## 796  6.407076 8.677469
## 798  6.631139 8.378032
## 808  6.476528 8.268881
## 814  6.302563 8.518802
## 815  6.621854 8.654187
## 819  6.292928 8.189562
## 838  6.847003 8.376027
## 844  6.317192 8.259769
## 851  6.542828 8.538429
## 877  6.447839 8.211258
## 890  6.403709 8.523652
## 891  6.373590 8.380272
## 895  6.359654 8.045609
## 904  6.342941 8.058783
## 906  6.600772 8.721166
## 914  6.914673 8.860534
## 917  6.528452 8.671336
## 918  6.929756 9.457507
## 930  6.374662 8.126438
## 931  6.329473 8.566715
## 938  6.268462 7.888268
## 939  6.401725 8.699766
## 942  6.268897 8.011411
## 945  6.698476 8.136579
## 952  6.535240 8.552151
## 959  6.650298 8.245840
## 965  6.573021 8.112980
## 970  6.405051 8.644900
## 971  6.082821 7.816655
## 990  6.624148 8.154736
## 993  6.283437 8.478330
## 1000 6.590041 8.503554
## 1001 6.580209 8.622928
## 1010 6.404956 8.200976
## 1013 6.394821 8.342434
## 1032 6.315333 8.551981
## 1050 6.171498 7.929635
## 1066 6.204242 8.388649
## 1067 6.005664 8.000128
## 1078 5.974082 7.463696
## 1089 6.371726 9.151203
## 1091 6.120406 7.768187
## 1094 6.578118 8.651970
## 1096 6.331987 8.352128
## 1100 6.490084 8.171842
## 1102 6.346412 8.135509
## 1104 6.249486 7.952976
## 1109 6.498179 8.369956
## 1122 6.375254 8.363321
## 1125 6.307210 7.882459
## 1135 6.136527 8.130718
## 1139 6.366817 8.260760
## 1147 6.330977 8.552363
## 1153 6.466313 8.538178
## 1157 6.368882 8.001666
## 1174 6.704779 8.590583

Run PCA

pca <- prcomp(exdata2[6:22016], center = T, scale. = T) # run PCA
# plot scree for PCA
library(factoextra)
fviz_eig(pca)

PCA scatterplot for PET status

library(ggfortify)
autoplot(pca, data = exdata2, colour = "PET", label = T) +
  viridis::scale_colour_viridis(option = "viridis", discrete = T)

PET scatterplot for apoe4 status

library(ggfortify)
autoplot(pca, data = exdata2, colour = "apoe4", label = T)

qqnorm(pca$x[,2], pch = 20)

Exclude outliers and run PCA again

exdata3 <- filter(exdata2, AIBL.Id != "771" & AIBL.Id != "914" & AIBL.Id != "918")

Run PCA

pca2 <- prcomp(exdata3[6:22016], center = T, scale. = T) # run PCA
library(factoextra)
fviz_eig(pca2)

Plot without outliers (PET)

library(ggfortify)
autoplot(pca2, data = exdata3, colour = "PET", label = T, label.label = "AIBL.Id") +
  viridis::scale_colour_viridis(option = "viridis", discrete = T)

Plot without outliers (apoe4)

library(ggfortify)
autoplot(pca2, data = exdata3, colour = "apoe4", label = T, label.label = "AIBL.Id")


1.3 Use limma to analyse expression data.

We first create an ‘ExpressionSet’ with genes on rows, and samples on columns, following the guide here: https://www.bioconductor.org/packages/3.7/bioc/vignettes/Biobase/inst/doc/ExpressionSetIntroduction.pdf

library(Biobase)
exprs <- as.matrix(t(exdata3[,6:22016])) # leaving behind the first 5 metadata columns
minimalSet <- ExpressionSet(assayData = exprs)
pData <- exdata3[,1:5]
summary(pData)
##     AIBL.Id       Demographic.Sex  PET          apoe4      
##  Min.   :   4.0   Female:96       NEG: 92   Min.   :0.000  
##  1st Qu.: 110.0   Male  :97       POS:101   1st Qu.:0.000  
##  Median : 480.0                             Median :0.000  
##  Mean   : 506.2                             Mean   :0.456  
##  3rd Qu.: 819.0                             3rd Qu.:1.000  
##  Max.   :1174.0                             Max.   :1.000  
##       Age       
##  Min.   :55.15  
##  1st Qu.:67.15  
##  Median :72.71  
##  Mean   :72.86  
##  3rd Qu.:78.90  
##  Max.   :91.79
library(limma)
PET_apoe4 <- paste(pData$PET, pData$apoe4, sep = ".") # put PET/apoe4 combinations into a vector

PET_apoe4 <- factor(PET_apoe4, levels = c(unique(PET_apoe4))) # turn it into a factor
design <- model.matrix(~0 + PET_apoe4) # create factor table for four combinations
colnames(design) <- levels(PET_apoe4)
fit <- lmFit(exprs, design)

Set four pair-wise contrasts of interest and compute the contrasts and moderated t-tests.

cont.matrix <- makeContrasts(apoe4_for_PET_yes = POS.1 - POS.0,
                             apoe4_for_PET_no = NEG.1 - NEG.0,
                             PET_for_apoe4_yes = POS.1 - NEG.1,
                             PET_for_apoe4_no = POS.0 - NEG.0,
                             levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

Get the genes differentially expressed in each comparison

res1 <- topTable(fit2, coef = "apoe4_for_PET_yes", n = Inf)
res2 <- topTable(fit2, coef = "apoe4_for_PET_no", n = Inf)
res3 <- topTable(fit2, coef = "PET_for_apoe4_yes", n = Inf)
res4 <- topTable(fit2, coef = "PET_for_apoe4_no", n = Inf)

length(which(res1$P.Value < 0.05))
## [1] 653
length(which(res2$P.Value < 0.05))
## [1] 684
length(which(res3$P.Value < 0.05))
## [1] 924
length(which(res4$P.Value < 0.05))
## [1] 769

Probe IDs are set in the ‘res’ data frames as rownames. Create separate column for rownames. Need to do it now, because when filter() is applied below, it removes the rownames.

for (i in ls(pattern = "res")){
  x <- get(i)
  x$probe_id <- rownames(x)
  assign(i, x)
}

Get list of genes that show significantly different expression between contrasts

signif_apoe4_for_PET_yes <- filter(res1, P.Value < 0.05)
signif_apoe4_for_PET_no <- filter(res2, P.Value < 0.05)
signif_PET_for_apoe4_yes <- filter(res3, P.Value < 0.05)
signif_PET_for_apoe4_no <- filter(res4, P.Value < 0.05)

rm(list = ls(pattern = "res")) # remove results to clear working memory

1.4 Get gene identifiers for microarray probes and identify significant genes.

Using JD’s code from ‘AIBL_Gene_Expression_SetUp_Dec82017.R’

library(pd.huex.1.0.st.v2)
library(huex10sttranscriptcluster.db)
  
### pull gene symbols and match to probe ids to replace rownames ###
x <- huex10sttranscriptclusterSYMBOL
# Get the probe identifiers that are mapped to a gene symbol
mapped_probes <- mappedkeys(x)
# Convert to a list
xx <- as.list(x[mapped_probes])
if(length(xx) > 0) {
# Get the SYMBOL for the first five probes
xx[1:5]
# Get the first one
xx[[1]]
}
## [1] "LOC102723600"
xx1 <- as.matrix(unlist(xx))

# Get table of probes
gns <- rownames(exdata)
gnsl <- xx1[rownames(xx1) %in% gns]
length(gnsl)
## [1] 14902
### Find ENSEMBL IDs ###
x3 <- huex10sttranscriptclusterENSEMBL
# Get the entrez gene IDs that are mapped to an Ensembl ID
mapped_genes <- mappedkeys(x3)
# Convert to a list
xx3 <- as.list(x3[mapped_genes])

xx3.1 <- as.matrix(unlist(xx3))
ens <- xx3.1[rownames(xx3.1) %in% gns]
length(ens)
## [1] 13954
### Find Entrez IDs ###
x4 <- huex10sttranscriptclusterENTREZID
# Get the probe identifiers that are mapped to an ENTREZ Gene ID
mapped_probes <- mappedkeys(x4)
# Convert to a list
xx4 <- as.list(x4[mapped_probes])

xx4.1 <- as.matrix(unlist(xx4))
entz <- xx4.1[rownames(xx4.1) %in% gns]
length(entz)
## [1] 14902

1.5 Match probes, gene symbols, Ensembl IDs, and Entrez IDs with the list of statistically significant genes

Create data frame of all probe matches with identifiers

pid_symb <- data.frame(xx1)
pid_symb$probe_id <- rownames(pid_symb)

pid_ens <- data.frame(xx3.1)
pid_ens$probe_id <- rownames(pid_ens)

pid_ent <- data.frame(xx4.1)
pid_ent$probe_id <- rownames(pid_ent)

pid_all <- dplyr::left_join(pid_symb, pid_ent, by = "probe_id") %>% 
  dplyr::left_join(y = pid_ens, by = "probe_id") %>% 
  dplyr::rename(gene_symbol = "xx1",
                entrez_id = "xx4.1",
                ensembl_id = "xx3.1") %>% 
  dplyr::select(probe_id, everything())

# write.table(pid_all, "~/Desktop/Uni Air/CSIRO/AD_project/Working/affy_probes_annotated.txt", row.names = F, col.names = T, quote = F, sep = "\t")

# find how many probes are annotated
length(which(pid_all$probe_id %in% rownames(exdata)))
## [1] 14902

Combine lists of significant genes for each contrast. Note that combined counts of significant genes for all four contrasts is 3030.

signif_all <- full_join(signif_apoe4_for_PET_yes, signif_apoe4_for_PET_no, by = "probe_id") %>%
  full_join(signif_PET_for_apoe4_no, by = "probe_id") %>% 
  full_join(signif_apoe4_for_PET_yes, by = "probe_id")

signif_all <- dplyr::select(signif_apoe4_for_PET_yes, probe_id, P.Value) %>% 
  full_join(dplyr::select(signif_apoe4_for_PET_no, probe_id, P.Value), by = "probe_id") %>% 
  full_join(dplyr::select(signif_PET_for_apoe4_yes, probe_id, P.Value), by = "probe_id") %>% 
  full_join(dplyr::select(signif_PET_for_apoe4_no, probe_id, P.Value), by = "probe_id") %>% 
  dplyr::rename(P.Value.c1 = "P.Value.x", 
         P.Value.c2 = "P.Value.y", 
         P.Value.c3 = "P.Value.x.x", 
         P.Value.c4 = "P.Value.y.y")

nrow(signif_all)
## [1] 2438

Therefore 2438 genes are expressed differentially across at least two contrasts

Bind gene ids to ‘signif_all’

sig_all2 <- left_join(signif_all, pid_all, by = "probe_id")

There are many NAs in the ‘gene_symbol’ column, even though there are 34,127 unique probe_ids in ‘pid_symb’ - one would think they would all be accounted for.

length(setdiff(signif_all$probe_id, pid_all$probe_id))
## [1] 989

Shows that there are 989 probe ids in ‘signif_all’ that are not in ‘pid_all’, i.e. they do not have any annotation data.

Confirmed by:

length(which(signif_all$probe_id %in% pid_all$probe_id == "TRUE"))
## [1] 1449

So are all probe_ids in the expression data in ‘pid_symb’?

length(setdiff(rownames(exdata), pid_symb$probe_id))
## [1] 7109

7109/22011 probes are not accounted for in the annotation data?

Confirm this:

length(which(rownames(exdata) %in% pid_all$probe_id == "TRUE"))
## [1] 14902

1.5.0.1 Venn diagram of overlapping genes

library(VennDiagram)
grid.newpage()
venn <- draw.quad.venn(area1 = length(which(!is.na(sig_all2$P.Value.c1))),
               area2 = length(which(!is.na(sig_all2$P.Value.c2))),
               area3 = length(which(!is.na(sig_all2$P.Value.c3))),
               area4 = length(which(!is.na(sig_all2$P.Value.c4))),
               n12 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2))),
               n13 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c3))),
               n14 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c4))),
               n23 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3))),
               n24 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c4))),
               n34 = length(which(!is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
               n123 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3))),
               n124 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c4))),
               n134 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
               n234 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))), 
               n1234 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
               category = c("apoe4_for_PET_yes", "apoe4_for_PET_no", "PET_for_apoe4_yes", "PET_for_apoe4_no"),
               cat.pos = c(0, 0, 0, 0))

Many of the same genes (265) are differentially expressed between: * PET pos/neg when they have the apoe4 allele; and * apoe4 yes/no when they are negative for PET

These could be genes that can protect against AD.

Many of the same genes (189) are also differentially expressed between: * PET pos/neg when they do not have the apoe4 allele; and * apoe4 yes/no when they are positive for PET

These latter genes must be acting independently of the apoe4 network to cause AD.

Find out how many people are in each group:

test <- exdata3[,1:5]
length(which(test$PET == "POS" & test$apoe4 == "1"))
## [1] 65
length(which(test$PET == "POS" & test$apoe4 == "0"))
## [1] 36
length(which(test$PET == "NEG" & test$apoe4 == "1"))
## [1] 23
length(which(test$PET == "NEG" & test$apoe4 == "0"))
## [1] 69

This suggests that more people tend to either have apoe4 and AD, or not have apoe4 and not have AD, than otherwise.


1.6 Compare expression profiles between PET-pos and PET-neg groups

Create design matrix

Group <- factor(exdata3$PET, levels = c("POS", "NEG"))
design2 <- model.matrix(~0 + Group)
colnames(design2) <- c("POS", "NEG")

Find differentially expressed genes

fit3 <- lmFit(exprs, design2)
cont.matrix2 <- makeContrasts(POSvsNEG = POS - NEG, levels = design2)
fit4 <- contrasts.fit(fit3, cont.matrix2)
fit4 <- eBayes(fit4)
topTable(fit4, adjust = "BH")
##               logFC  AveExpr         t      P.Value adj.P.Val           B
## 2328611 -0.24349043 6.931149 -3.781354 0.0002074070 0.9997102  0.50013583
## 3997946  0.14334623 9.035701  3.648961 0.0003381087 0.9997102  0.07930919
## 2566383 -0.08138720 8.334744 -3.626837 0.0003664051 0.9997102  0.01024134
## 2694314  0.10608542 6.813847  3.553376 0.0004772006 0.9997102 -0.21648884
## 3707258  0.06258903 6.911761  3.470792 0.0006390722 0.9997102 -0.46656387
## 3957224  0.08050569 6.324961  3.469708 0.0006415049 0.9997102 -0.46981262
## 3934479  0.07847492 6.465840  3.468142 0.0006450341 0.9997102 -0.47450352
## 3819771  0.21167767 5.350143  3.463788 0.0006549411 0.9997102 -0.48753481
## 3462630 -0.07185294 3.382152 -3.439265 0.0007134666 0.9997102 -0.56067593
## 4038494  0.10130047 3.711608  3.427094 0.0007442989 0.9997102 -0.59680772
results_PETposvneg <- topTable(fit4, n = Inf)
results_PETposvneg$probe_id <- rownames(results_PETposvneg)
signif_PET_pos_v_neg <- filter(results_PETposvneg, P.Value < 0.05)
dim(signif_PET_pos_v_neg)
## [1] 865   7

Now for apoe4

Group <- factor(exdata3$apoe4, levels = c(0, 1))
design3 <- model.matrix(~0 + Group)
colnames(design3) <- c("NEG", "POS")

Find differentially expressed genes

fit5 <- lmFit(exprs, design3)
cont.matrix3 <- makeContrasts(apoe4 = NEG - POS, levels = design3)
fit6 <- contrasts.fit(fit5, cont.matrix3)
fit6 <- eBayes(fit6)
topTable(fit6, adjust = "BH")
##               logFC  AveExpr         t      P.Value adj.P.Val           B
## 3360287  0.17135398 6.377782  4.107229 5.892379e-05 0.9997616  1.58861565
## 3453446  0.20354250 4.358425  3.702309 2.781176e-04 0.9997616  0.24693729
## 3275398  0.27781363 6.503623  3.643283 3.451692e-04 0.9997616  0.06124851
## 3749570  0.18071224 6.920419  3.505805 5.650003e-04 0.9997616 -0.36123353
## 3450180 -0.08479851 5.565530 -3.475845 6.278481e-04 0.9997616 -0.45142875
## 3719456  0.15291395 3.122911  3.416413 7.723809e-04 0.9997616 -0.62834664
## 3150579 -0.15573427 5.935585 -3.391778 8.409687e-04 0.9997616 -0.70089559
## 2777276  0.07471033 4.025781  3.335889 1.018232e-03 0.9997616 -0.86377471
## 2342904  0.09381612 5.328055  3.331353 1.034053e-03 0.9997616 -0.87689050
## 2432714 -0.20563079 6.724809 -3.232473 1.441337e-03 0.9997616 -1.15884979
results_apoe4posvneg <- topTable(fit6, n = Inf)
results_apoe4posvneg$probe_id <- rownames(results_apoe4posvneg)
signif_apoe4_pos_v_neg <- filter(results_apoe4posvneg, P.Value < 0.05)
dim(signif_apoe4_pos_v_neg)
## [1] 554   7

Create table of all significant differentially expressed genes

sig_direct_pairs <- full_join(
  dplyr::select(signif_PET_pos_v_neg, P.Value, probe_id),
  dplyr::select(signif_apoe4_pos_v_neg, P.Value, probe_id), 
  by = "probe_id") %>% 
  dplyr::select(probe_id, everything())
colnames(sig_direct_pairs)[2] <- "P.Value_PET"
colnames(sig_direct_pairs)[3] <- "P.Value_apoe4"
library(VennDiagram)
grid.newpage()
venn2 <- draw.pairwise.venn(
  area1 = length(which(!is.na(sig_direct_pairs$P.Value_PET))),
  area2 = length(which(!is.na(sig_direct_pairs$P.Value_apoe4))),
  cross.area = length(which(!is.na(sig_direct_pairs$P.Value_PET) & !is.na(sig_direct_pairs$P.Value_apoe4))),
  category = c("PET_pos_v_neg", "apoe4_pos_v_neg"),
  ext.text = FALSE, cat.prompts = T, cat.pos = 0)

## INFO [2018-02-28 15:49:55] Placing category labels at default outer locations.  Use 'cat.pos' and 'cat.dist' to modify location.
## INFO [2018-02-28 15:49:55] Current 'cat.pos': 0 degrees, 0 degrees
## INFO [2018-02-28 15:49:55] Current 'cat.dist': 0.025 , 0.025

2 SNP and eQTL analysis

Import IDs for genotyped SNPs and proxy SNPs (those in LD with the genotyped SNPs), and annotate them with gene names

install.packages("tidyverse")
source("https://bioconductor.org/biocLite.R")
biocLite()
biocLite("biomaRt")
biocLite("GenomicRanges")
library(GenomicRanges)
biocLite("Homo.sapiens")
library(Homo.sapiens)
biocLite("snpStats")
install.packages("MatrixEQTL")

2.1 Import files

aibl_snps <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/AIBLgene_SNP_LIST_04032015.tsv")
snap_edit <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/snapResults_edit.txt")
snap <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/snapResults.txt")

2.2 Combine datasets

Note: ‘snap_edit’ has removed all SNPs from ‘snap’ that did not have proxies.

# How many SNPs are there in snap_edit?
length(unique(snap_edit$SNP)) # Shows that 2084 - 1527 = 557 have been removed from aibl_snps. 
## [1] 1527
# How many genes are there in aibl_snps?
length(unique(aibl_snps$GENE))
## [1] 298
# How many aibl_snps don't have an associated gene?
which(aibl_snps$GENE == "")
##  [1]  116  464  511  512  668  675  788  856  877  896  897  960 1342 1409
## [15] 1612 1989
# One entry has "-" in the gene column.
which(aibl_snps$GENE == "-") # shows that there is 1 additional SNP with no associated gene
## [1] 415
length(setdiff(aibl_snps$SNP, snap_edit$SNP)) # same number as above - 557 removed.
## [1] 557
# How many unique proxy SNPs?
length(unique(snap_edit$Proxy))
## [1] 18118
library(tidyverse)
AIBLdummy <- dplyr::select(aibl_snps, SNP, GENE) # creates dummy data frame with just two variables
snap2 <- left_join(snap_edit, AIBLdummy, by = "SNP") # combine data frames
rm(AIBLdummy) # remove dummy
snap2 <- dplyr::select(snap2, SNP, SNP_gene = "GENE", dplyr::everything())# reorder columns

2.3 Get gene names and locations for AIBL_snps

Get ensembl ids and loci from BioMart

snps <- as.vector(aibl_snps$SNP) # Create vector of aibl_snps
library(biomaRt)
listMarts()
mart_snps <- useMart('ENSEMBL_MART_SNP')
listDatasets(mart_snps) # lists datasets available
mart_snps <- useMart('ENSEMBL_MART_SNP', 'hsapiens_snp')
SNP_genes <- getBM(attributes = c("refsnp_id", "allele_1", "minor_allele", "minor_allele_freq", "chr_name", "chrom_start", "chrom_end", "ensembl_gene_stable_id"),
                     filters = "snp_filter",
                     values = snps,
                     mart = mart_snps)

See how many SNPs returned results

length(unique(SNP_genes$refsnp_id))
## [1] 2075

Use ‘ensembl_ids’ to get loci and hgnc symbols for genes

ensembl_ids <- unique(SNP_genes$ensembl_gene_stable_id) # Create `ensembl_ids' vector of unique ensembl gene ids from 'SNP_genes'
listMarts()
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL')
listDatasets(mart_genes)
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL', 'hsapiens_gene_ensembl')
ensembl_genes <- getBM(attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position", "strand", "hgnc_symbol", "entrezgene"),
                       filters = "ensembl_gene_id",
                       values = ensembl_ids,
                       mart = mart_genes)

Bind both tables by ensembl_id to give full table of genes associated with the AIBL snps

SNP_genes_full <- dplyr::left_join(SNP_genes, ensembl_genes, by = c("ensembl_gene_stable_id" = "ensembl_gene_id"))

Remove duplicate rows with “LRG…” in the ‘ensembl_gene_stable_id’ column

SNP_genes_full <- SNP_genes_full[-grep("LRG", SNP_genes_full$ensembl_gene_stable_id), ]
dim(SNP_genes_full)
## [1] 3492   14
length(unique(SNP_genes_full$refsnp_id))
## [1] 2075
length(unique(SNP_genes_full$ensembl_gene_stable_id))
## [1] 621
# SNP_genes_full$ensembl_gene_stable_id has blank entries - replace with 'NA'
SNP_genes_full$ensembl_gene_stable_id[SNP_genes_full$ensembl_gene_stable_id == ""] <- NA
length(unique(SNP_genes_full$refsnp_id[!is.na(SNP_genes_full$ensembl_gene_stable_id)]))
## [1] 1994
# do the same for hgnc_symbol
SNP_genes_full$hgnc_symbol <- as.character(SNP_genes_full$hgnc_symbol) # convert to character
SNP_genes_full$hgnc_symbol[SNP_genes_full$hgnc_symbol == ""] <- NA # replace blank values with NA

## write table
# write.table(SNP_genes_full, "~/Desktop/Uni Air/CSIRO/AD_project/Working/aibl_snp_genes.txt", sep = "\t", col.names = T, row.names = F, quote = F)

2.4 Get gene names and locations for proxy SNPs

Get ensembl ids and loci for proxies

proxy_snps <- as.vector(unique(snap_edit$Proxy)) # Create vector of proxy snps
length(proxy_snps)
listMarts()
mart_snps <- useMart('ENSEMBL_MART_SNP')
listDatasets(mart_snps) # lists datasets available
mart_snps <- useMart('ENSEMBL_MART_SNP', 'hsapiens_snp')

proxy_genes <- getBM(attributes = c("refsnp_id", "chr_name", "chrom_start", "chrom_end", "ensembl_gene_stable_id"),
                     filters = "snp_filter",
                     values = proxy_snps,
                     mart = mart_snps)

See how many SNPs returned results loci information

length(unique(proxy_genes$refsnp_id))
## [1] 17847

Remove rows with “LRG” in ‘ensemble_gene_stable_id’

proxy_genes <- proxy_genes[-grep("LRG", proxy_genes$ensembl_gene_stable_id), ]
dim(proxy_genes)
## [1] 24632     5

Create `ensembl_ids’ vector of unique ensembl gene ids from ‘proxy_genes’

ensembl_ids_proxies <- unique(proxy_genes$ensembl_gene_stable_id)
length(ensembl_ids_proxies)
## [1] 1103

Use ‘ensembl_ids’ to get loci and hgnc symbols for genes

listMarts()
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL')
listDatasets(mart_genes)
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL', 'hsapiens_gene_ensembl')
ensembl_proxy_genes <- getBM(attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position", "strand", "hgnc_symbol"),
                       filters = "ensembl_gene_id",
                       values = ensembl_ids_proxies,
                       mart = mart_genes)

Bind both tables by ensembl_id to give full table of genes associated with the proxy SNPs

proxy_genes_full <- left_join(proxy_genes, ensembl_proxy_genes, by = c("ensembl_gene_stable_id" = "ensembl_gene_id"))
dim(proxy_genes_full)
## [1] 24632    10
# substitute blank values in hgnc_symbol column with NA
proxy_genes_full$hgnc_symbol <- as.character(proxy_genes_full$hgnc_symbol) # convert to character
proxy_genes_full$hgnc_symbol[proxy_genes_full$hgnc_symbol == ""] <- NA # replace blank values with NA
proxy_gns_uq <- proxy_genes_full[is.na(proxy_genes_full$hgnc_symbol) == F, ] # create new data frame with SNPs that have been annotated with gene symbols

# get unique values
length(unique(proxy_gns_uq$refsnp_id))
## [1] 15255
length(unique(proxy_gns_uq$hgnc_symbol)) 
## [1] 795
## write table
# write.table(proxy_genes_full, "~/Desktop/Uni Air/CSIRO/AD_project/Working/proxy_snp_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")

2.5 Determine differences between genes in SNP and proxy lists

Find out how many unique genes there are in the SNP and proxy lists

length(unique(SNP_genes_full$ensembl_gene_stable_id))
## [1] 621
length(unique(proxy_genes_full$ensembl_gene_stable_id))
## [1] 1103

Find out how many proxy genes are different from the SNP genes, and vice versa

length(setdiff(SNP_genes_full$ensembl_gene_stable_id, proxy_genes_full$ensembl_gene_stable_id))
## [1] 127
length(setdiff(proxy_genes_full$ensembl_gene_stable_id, SNP_genes_full$ensembl_gene_stable_id))
## [1] 609

So 127/621 genes in the AIBL SNP list are not in the proxy list, and 609/1103 genes in the proxy list are not in the SNP list.


2.6 Read in genotype and expression data

library(readxl)
gtypes <- read_excel("~/Dropbox/eQTL/Data/Expanded SNP set/AIBLgene_SNP_Data_by_Gene_Expression_IDs.xlsx",
                col_names = F)
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)

Note that the AIBL IDs are incorrect in the excel file. We attach the correct IDs using the ‘AIBL_Gene_Expression_IDs_UpdtdDec2017.txt’ file.

ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")

# bind colnames for exdata to new IDs
ids2$old_ids <- as.integer(gsub("X", "", colnames(exdata)))

# reorder to same order as the SNP genotype data
ids2 <- ids2[order(ids2$old_ids), ]

# merge with rownames of genotype data to ensure perfect match
ids3 <- data.frame(gtypes[5:nrow(gtypes), 1])# extract AIBL IDs from genotype data
ids3$X__1 <- as.integer(ids3$X__1)
ids3 <- merge(ids3, ids2, by.x = "X__1", by.y = "old_ids")

# add new column names to the genotype data
ids2 <- as.vector(ids3$x)
gtypes$AIBL_ID_new <- c(NA, NA, NA, "AIBL_ID_new", as.vector(ids2))
gtypes <- dplyr::select(gtypes, AIBL_ID_new, dplyr::everything()) # reorder to bring AIBL_ID_new to the front
gts <- as.tibble(t(gtypes)) # transpose data frame
gts <- as.tibble(lapply(gts, function(x) { # convert all 'X_X' to NA
  gsub("X_X", NA, x)
}))

Because some individuals failed to record a result for some SNPs in addition to the few individuals for whom no results were recorded, we will manually remove the latter.

colnames(gts) <- gts[3, ] # make the column names the (incorrect) AILB IDs

# some individuals have data missing for ~1,500 of the ~2,000 SNPs, e.g.

length(which(is.na(gts$`127`)) == TRUE)
## [1] 1515
x <- sapply(gts, function(x) { # gets TRUE or FALSE for each column with data missing for more than 1000 SNPs
  length(which(is.na(x) == TRUE)) > 1000
})
y <- which(x == TRUE) # creates vector of column indexes for which it is true
gts[y] <- NULL # removes those columns

colnames(gts) <- c("chromosome", "position", "gene", "snp", gts[1, 5:ncol(gts)]) # replace column names with correct AIBL IDs

gts <- gts[-c(1:3), ] # remove extraneous first three rows

Write table to working folder

# write.table(gts, file = "~/Desktop/Uni Air/CSIRO/AD_project/Working/genotype_data.txt", sep = "\t")

2.9 eQTL analysis for whole cohort

Read in data.

exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
meta <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T, sep = "\t")

2.9.1 Put into format required by the MatrixEQTL package.

2.9.1.1 Covariates file

# create binary statuses for sex and PET
meta$Demographic.Sex <- ifelse(meta$Demographic.Sex == "Male", 0, 
                               ifelse(meta$Demographic.Sex == "Female", 1, NA))
meta$PET <- ifelse(meta$PET == "POS", 1,
                   ifelse(meta$PET == "NEG", 0, NA))

# tidy metadata
meta <- data.frame(t(meta), stringsAsFactors = F)
colnames(meta) <- meta[1, ]
meta$id <- rownames(meta)
rownames(meta) <- seq(1:nrow(meta))
meta <- dplyr::select(meta, id, dplyr::everything())
meta <- meta[-1, ]

# tidy probe data
exdata$probe_id <- rownames(exdata)
exdata <- dplyr::select(exdata, probe_id, dplyr::everything())
rownames(exdata) <- seq(1:nrow(exdata))

2.9.1.2 Genotype file

# tidy genotype data
ped6 <- ped[, -c(1:3)]
ped6 <- left_join(ped6, dplyr::select(snp_gns[!duplicated(snp_gns$refsnp_id), ], refsnp_id, allele_1, minor_allele), by = c("snp" = "refsnp_id")) %>% 
  dplyr::select(snp, allele_1, minor_allele, everything())
ped6 <- data.frame(lapply(ped6, function(x) gsub("_", " ", x)),
                   stringsAsFactors = F)

Noticed that for some SNPs the alleles annotated with BioMart differ from the alleles present in our data (e.g. “rs1064261” is annotated as having A and G alleles, but our data shows T and C), and some SNPs are annotated as having the same major and minor alleles (e.g. “rs3753584” which has C for both its major and minor allele).

We will therefore get another plink output to find calculate which alleles are major and minor when they differ from the BioMart annotations, or when the BioMart annotations seem incorrect. We used the code: ./plink --file snp_data --map3 --noweb --no-fid --no-parents --fisher --out allele_freq

# import plink data on allele frequencies
pk_frq <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/allele_freq.assoc.fisher", na.strings = NA, header = T, fill = T)
pk_frq <- dplyr::select(pk_frq, snp = SNP, pk_major_allele = A2, pk_minor_allele = A1)

# create comparison data frame
comp <- data.frame(left_join(dplyr::select(ped6, snp, allele_1, minor_allele), pk_frq, by = "snp") %>% 
  dplyr::select(snp, allele_1, pk_major_allele, minor_allele, pk_minor_allele), stringsAsFactors = F)
comp <- data.frame(sapply(comp, as.character), stringsAsFactors = F)

# create empty columns for correct allele calls
comp$cor_mj <- NA
comp$cor_mn <- NA

# substitute empty values with NA
comp[, 2:5] <- sapply(comp[, 2:5], function(x) gsub("0", NA, x))
comp[, 2:5] <- sapply(comp[, 2:5], function(x) gsub("^$|^ $", NA, x))

# add correct major allele column
comp$cor_mj <- unlist(apply(comp, 1, function(x){
  if (is.na(x[2]) == T & is.na(x[3]) == T){ # if both are NA, return NA
    x[6] <- NA
  } else if (x[2] == x[5] & x[4] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)){ # if annotated and plink alleles are swapped around, prefer annotated alleles
    x[6] <- x[2]  
  } else if (is.na(x[2]) == T & is.na(x[3] == F)){ # if only plink output has an allele, use that
    x[6] <- x[3]
  } else if (is.na(x[2]) == F & is.na(x[3] == T)){ # if only the annotation has an allele, use that
    x[6] <- x[2]
  } else if (x[2] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)) { # if the annotated and plink alleles are the same, just use the annotated allele
    x[6] <- x[2]
  } else { # otherwise use the plink allele
    x[6] <- x[3]
  }
}))

# add correct minor allele column
comp$cor_mn <- unlist(apply(comp, 1, function(x){
    if (is.na(x[4]) == T & is.na(x[5]) == T){
    x[7] <- NA
  } else if (x[2] == x[5] & x[4] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)){
    x[7] <- x[4]
  } else if (is.na(x[4]) == T & is.na(x[5] == F)){
    x[7] <- x[5]
  } else if (is.na(x[4]) == F & is.na(x[5] == T)){
    x[7] <- x[4]
  } else if (x[4] == x[5] & is.na(x[4]) == F & is.na(x[5]) == F) {
    x[7] <- x[4]
  } else {
    x[7] <- x[5]
  }
}))

Incorporate adjusted major and minor allele into ped6 and create genotype where: * 0 = homozygous major allele * 1 = heterozygous * 2 = homozygous minor allele

ped6$allele_1 <- comp$cor_mj
ped6$minor_allele <- comp$cor_mn

out <- t(data.frame(apply(ped6, 1, function(x){ ## WORKS - need to change the second function from sapply to lapply
  a <- paste(x[2], x[2], sep = " ")
  b <- paste(x[2], x[3], sep = " ")
  c <- paste(x[3], x[2], sep = " ")
  d <- paste(x[3], x[3], sep = " ")
  results <- lapply(x[4:length(x)], function(y){
    if (is.na(y) == T){
      y <- NA
    }
    else if (y == a & is.na(y) == F){
      y <- 0
    }
    else if (y == b | y == c & is.na(y) == F){
      y <- 1
    } 
    else if (y == d & is.na(y) == F){
      y <- 2
    }
  })
  data.frame(do.call(rbind, results))
})))

# bind snp info to genotype coding
ped7 <- data.frame(cbind(ped6[, 1], out), stringsAsFactors = F)
rownames(ped7) <- seq(1:nrow(ped7))
ped7 <- dplyr::rename(ped7, snpid = V1)
colnames(ped7) <- gsub("X", "", colnames(ped7))

2.9.1.3 Expression file

exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2

# extract only those columns (samples) for which we have genotype data
exdata <- exdata[, colnames(exdata) %in% colnames(ped7)]

# create column for probe ID and reorder
exdata$probe_id <- rownames(exdata)
exdata <- dplyr::select(exdata, probe_id, everything())

Finalise covariates file by extracting only those columns (samples) for which we have genotype data

cns <- c("id", colnames(meta)[colnames(meta) %in% colnames(ped7)])
meta <- meta[, cns]

Write files

# write.table(ped7, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# write.table(meta, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# write.table(exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")

2.9.2 Set up parameters for eQTL

library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt"
output_file_name = tempfile()

pvOutputThreshold = 1e-2
errorCovariance = numeric()

Load snps data

snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )

Load gene data

gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )

Load covariate data

cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )

2.9.3 Run Matrix eQTL analysis

 me = Matrix_eQTL_engine(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = pvOutputThreshold,
    useModel = useModel, 
    errorCovariance = errorCovariance, 
    verbose = TRUE,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

2.9.4 Pull out significant SNP-gene relationships

# create table of annotated significant genes (from original limma)
sig_PET_pos_v_neg <- left_join(signif_PET_pos_v_neg, pid_all, by = "probe_id")

# read in annotated probe table
pid_all <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/affy_probes_annotated.txt", header = T, sep = "\t", stringsAsFactors = F)

# pull out eQTLs from output
eqtls <- me[["all"]][["eqtls"]]

# pull out significant SNP-gene relationships (FDR < 0.05)
sig_eqtls <- eqtls[eqtls$FDR < 0.05, ]
dim(sig_eqtls)
## [1] 3055    6
# number of unique SNPs
length(unique(sig_eqtls$snps))
## [1] 130
# number of unique probes
length(unique(sig_eqtls$gene))
## [1] 2976
# annotate
sig_eqtls$snps <- as.character(sig_eqtls$snps)
sig_eqtls$gene <- as.character(sig_eqtls$gene)
pid_all$probe_id <- as.character(pid_all$probe_id)
sig_eqtl_snps <- dplyr::left_join(sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
sig_eqtl_gns <- dplyr::left_join(sig_eqtls, pid_all, by = c("gene" = "probe_id"))

## write files
# write.table(sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")

# find any hits with the genes implicated in the 2018 study
nrow(sig_eqtl_snps[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", sig_eqtl_snps$hgnc_symbol), ])
## [1] 20
nrow(sig_eqtl_gns[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", sig_eqtl_gns$hgnc_symbol), ])
## [1] 0
# see whether any of the SNP genes intersect with the expression genes (i.e. are any of them acting cis?)
eqtl_snp_gns <- as.vector(na.omit(unique(sig_eqtl_snps$hgnc_symbol)))
length(eqtl_snp_gns)
## [1] 101
eqtl_de_gns <- as.vector(na.omit(unique(sig_eqtl_gns$gene_symbol)))
length(eqtl_de_gns)
## [1] 2246
intersect(eqtl_snp_gns, eqtl_de_gns)
## [1] "HSD11B2" "CYP21A2" "ACE"     "TXNL4B"  "ICAM5"   "IDE"     "SLC30A3"
## [8] "SGMS1"

130 significant AIBL SNPs in 101 annotated genes, affecting the expression levels of 2242 genes. Only eight of them are shared between the SNP genes and the differentially expressed genes, meaning that only those appear to be acting cis.


2.10 eQTL analysis separately for PET-pos and -neg groups

2.10.1 Find out which AIBL IDs match with PET pos and neg status and create covariates file for them

# create covariates file for pos status
PET_pos_covs <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", sep = "\t", header = T)
PET_pos_covs <- data.frame(t(PET_pos_covs), stringsAsFactors = F)
colnames(PET_pos_covs) <- PET_pos_covs[1, ]
PET_pos_covs$aibl_id <- gsub("X", "", rownames(PET_pos_covs))
PET_pos_covs$PET <- as.integer((PET_pos_covs$PET))
PET_pos_covs <- dplyr::filter(PET_pos_covs, PET == 1)
PET_pos_covs <- data.frame(t(PET_pos_covs), stringsAsFactors = F)
colnames(PET_pos_covs) <- PET_pos_covs[nrow(PET_pos_covs), ]
PET_pos_covs <- PET_pos_covs[-c(2, nrow(PET_pos_covs)), ] # remove PET and id rows
PET_pos_covs$id <- rownames(PET_pos_covs)
PET_pos_covs <- dplyr::select(PET_pos_covs, id, everything())
rownames(PET_pos_covs) <- NULL
# write.table(PET_pos_covs, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")

# create covariates file for neg status
PET_neg_covs <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", sep = "\t", header = T)
PET_neg_covs <- data.frame(t(PET_neg_covs), stringsAsFactors = F)
colnames(PET_neg_covs) <- PET_neg_covs[1, ]
PET_neg_covs$aibl_id <- gsub("X", "", rownames(PET_neg_covs))
PET_neg_covs$PET <- as.integer((PET_neg_covs$PET))
PET_neg_covs <- dplyr::filter(PET_neg_covs, PET == 0)
PET_neg_covs <- data.frame(t(PET_neg_covs), stringsAsFactors = F)
colnames(PET_neg_covs) <- PET_neg_covs[nrow(PET_neg_covs), ]
PET_neg_covs <- PET_neg_covs[-c(2, nrow(PET_neg_covs)), ]
PET_neg_covs$id <- rownames(PET_neg_covs)
PET_neg_covs <- dplyr::select(PET_neg_covs, id, everything())
rownames(PET_neg_covs) <- NULL
# write.table(PET_neg_covs, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")

2.10.2 Divide out expression data

## Get expression data
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
exdata <- exdata[, colnames(exdata) %in% colnames(ped7)] # extract only those columns (samples) for which we have genotype data

# for POS group
pos_exdata <- exdata[, colnames(exdata) %in% colnames(PET_pos_covs)]
pos_exdata$probe_id <- rownames(pos_exdata) # create column for probe ID and reorder
pos_exdata <- dplyr::select(pos_exdata, probe_id, everything())
# write.table(pos_exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")

# for NEG group
neg_exdata <- exdata[, colnames(exdata) %in% colnames(PET_neg_covs)]
neg_exdata$probe_id <- rownames(neg_exdata) # create column for probe ID and reorder
neg_exdata <- dplyr::select(neg_exdata, probe_id, everything())
# write.table(neg_exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")

2.10.3 Divide out genotype data

# read in data
gtps <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt", header = T, sep = "\t")
colnames(gtps) <- gsub("X", "", colnames(gtps))

# for POS group
pet_pos_gtps <- gtps[, colnames(gtps) %in% colnames(PET_pos_covs)]
pet_pos_gtps <- cbind(snpid = gtps[, 1], pet_pos_gtps)
# write.table(pet_pos_gtps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_gtps.txt", col.names = T, row.names = F, quote = F, sep = "\t")

# for NEG group
pet_neg_gtps <- gtps[, colnames(gtps) %in% colnames(PET_neg_covs)]
pet_neg_gtps <- cbind(snpid = gtps[, 1], pet_neg_gtps)
# write.table(pet_neg_gtps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_gtps.txt", col.names = T, row.names = F, quote = F, sep = "\t")

2.10.4 eQTL for POS group

Set up parameters for eQTL

library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_gtps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_covs.txt"
output_file_name = tempfile()

pvOutputThreshold = 1e-2
errorCovariance = numeric()

Load snps data

snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )

Load gene data

gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )

Load covariate data

cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )

Run Matrix eQTL on PET-pos group

 me_pos = Matrix_eQTL_engine(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = pvOutputThreshold,
    useModel = useModel, 
    errorCovariance = errorCovariance, 
    verbose = TRUE,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

2.10.5 eQTL for NEG group

Set up parameters for eQTL

library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_gtps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_covs.txt"
output_file_name = tempfile()

pvOutputThreshold = 1e-2
errorCovariance = numeric()

Load snps data

snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )

Load gene data

gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )

Load covariate data

cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )

Run Matrix eQTL on PET-neg group

 me_neg = Matrix_eQTL_engine(
    snps = snps,
    gene = gene,
    cvrt = cvrt,
    output_file_name = output_file_name,
    pvOutputThreshold = pvOutputThreshold,
    useModel = useModel, 
    errorCovariance = errorCovariance, 
    verbose = TRUE,
    pvalue.hist = TRUE,
    min.pv.by.genesnp = FALSE,
    noFDRsaveMemory = FALSE)

2.10.6 Pull out significant SNP-gene relationships for POS group

# pull out eQTLs from output
pos_eqtls <- me_pos[["all"]][["eqtls"]]

# pull out significant SNP-gene relationships (FDR < 0.05)
pos_sig_eqtls <- pos_eqtls[pos_eqtls$FDR < 0.05, ]
dim(pos_sig_eqtls)
## [1] 121   6
# annotate
pos_sig_eqtl_snps <- left_join(pos_sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
pos_sig_eqtl_gns <- left_join(pos_sig_eqtls, pid_all, by = c("gene" = "probe_id"))

## write files
# write.table(pos_sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/pos_eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(pos_sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/pos_eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")

# number of unique SNPs
length(unique(pos_sig_eqtl_snps$snps))
## [1] 28
# number of unique genes the SNPs sit within (note that it includes one NA)
length(unique(as.character(pos_sig_eqtl_snps$hgnc_symbol)))
## [1] 32
# number of unique differentially expressed genes (note that it includes one NA)
length(unique(pos_sig_eqtl_gns$gene_symbol))
## [1] 49
# create vectors of gene names for snps and differentially expressed genes
pos_eqtl_snp_gns <- as.vector(na.omit(unique(pos_sig_eqtl_snps$hgnc_symbol)))
pos_eqtl_snp_gns
##  [1] "MIR6796"  "PLD3"     "FYCO1"    "MIR6165"  "NGFR"     "NEDD9"   
##  [7] "CLU"      "TRDMT1"   "SGMS1"    "HP"       "HPR"      "TXNL4B"  
## [13] "NPY4R2"   "APOC1"    "OTC"      "XRCC3"    "KLC1"     "ATAT1"   
## [19] "C6orf136" "LRP8"     "DNMT3B"   "DNMT1"    "CYP11B2"  "GML"     
## [25] "MEI1"     "AKR1C3"   "CD33"     "PPARA"    "TFRC"     "NPC1"    
## [31] "C18orf8"
pos_eqtl_de_gns <- as.vector(na.omit(unique(pos_sig_eqtl_gns$gene_symbol)))
pos_eqtl_de_gns
##  [1] "ADGRG7"  "RBM46"   "DLGAP5"  "TTK"     "HMMR"    "FST"     "KIF15"  
##  [8] "BUB1"    "ASPM"    "PBK"     "MCM10"   "KIF14"   "POLQ"    "MND1"   
## [15] "CDC6"    "CEP55"   "CLSPN"   "DTL"     "CCNB2"   "CCNA2"   "PCLAF"  
## [22] "CCNB1"   "KIF11"   "CENPF"   "MCM4"    "MKI67"   "EXO1"    "GLYATL2"
## [29] "NUF2"    "ANLN"    "KIF23"   "CA1"     "STIL"    "CDCA2"   "APOD"   
## [36] "PLK1"    "TRIP13"  "GTF3C3"  "GFM2"    "BIRC5"   "CDC45"   "NCAPH"  
## [43] "AHSP"    "NCAPG"   "RRM2"    "E2F8"    "OR1E2"   "ART1"
# find which ones are shared
intersect(pos_eqtl_snp_gns, pos_eqtl_de_gns)
## character(0)

28 SNPs in 31 unique genes (32 less NA) affecting 48 unique genes (49 less NA). None of those genes are shared.

2.10.7 Pull out significant SNP-gene relationships for NEG group

# pull out eQTLs from output
neg_eqtls <- me_neg[["all"]][["eqtls"]]

# pull out significant SNP-gene relationships (FDR < 0.05)
neg_sig_eqtls <- neg_eqtls[neg_eqtls$FDR < 0.05, ]
dim(neg_sig_eqtls)
## [1] 160   6
head(neg_sig_eqtls)
##        snps    gene statistic       pvalue          FDR     beta
## 1 rs5030400 2633737  30.58395 9.851831e-44 4.302277e-36 3.793769
## 2    rs1937 3343900  17.69020 2.686827e-28 5.866663e-21 1.995096
## 3    rs1937 3514711  16.75808 6.696144e-27 9.747314e-20 2.041898
## 4    rs1937 3475130  14.92243 5.167194e-24 5.641261e-17 1.259226
## 5    rs1937 2557585  14.17949 8.604028e-23 7.514728e-16 1.884516
## 6    rs1937 3712197  11.93355 6.458826e-19 4.700930e-12 1.313909
# number of unique SNPs
length(unique(neg_sig_eqtls$snps))
## [1] 35
# annotate
neg_sig_eqtl_snps <- left_join(neg_sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
neg_sig_eqtl_gns <- left_join(neg_sig_eqtls, pid_all, by = c("gene" = "probe_id"))

## write files
# write.table(neg_sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/neg_eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(neg_sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/neg_eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")

# create vectors of gene names for snps and differentially expressed genes
neg_eqtl_snp_gns <- as.vector(na.omit(unique(neg_sig_eqtl_snps$hgnc_symbol)))
length(neg_eqtl_snp_gns)
## [1] 34
neg_eqtl_snp_gns
##  [1] "ICAM5"       "ICAM1"       "ICAM4"       "TFAM"        "PPP3CB"     
##  [6] "EPHA1"       "MME"         "PPARA"       "NPC1"        "MRPS18B"    
## [11] "PTMAP1"      "ATAT1"       "MTHFD1L"     "DNAJC5G"     "SLC30A3"    
## [16] "BIN1"        "MICAL1"      "PYY"         "BEST1"       "FTH1"       
## [21] "CR1"         "CDKN2B-AS1"  "CLU"         "IL6"         "CYP11B2"    
## [26] "GML"         "HSD17B3-AS1" "HSD17B3"     "PSAP"        "EPC2"       
## [31] "ACO1"        "UCP3"        "MTHFR"       "C1orf167"
neg_eqtl_de_gns <- as.vector(na.omit(unique(neg_sig_eqtl_gns$gene_symbol)))
length(neg_eqtl_de_gns)
## [1] 74
neg_eqtl_de_gns
##  [1] "ADGRG7"    "CTAGE3P"   "EFHC1"     "UTS2"      "L1TD1"    
##  [6] "TBRG1"     "ATP5L"     "CHRDL2"    "FAM20A"    "SLTM"     
## [11] "TUG1"      "SNRPA1"    "ERP27"     "IL31RA"    "HCLS1"    
## [16] "RBM25"     "PRRG1"     "PPP6R3"    "RUNDC3B"   "ENTPD4"   
## [21] "ZFC3H1"    "DPP10"     "CDK12"     "MED4"      "MYSM1"    
## [26] "HOXA1"     "R3HDM2"    "SECISBP2"  "EZH1"      "HINT1"    
## [31] "NEK1"      "OR1C1"     "ZMYM2"     "RB1CC1"    "RSRC2"    
## [36] "KIAA1468"  "GSN"       "ZBTB1"     "SLC26A5"   "TBC1D15"  
## [41] "SRSF5"     "RPS15A"    "CNTRL"     "DUSP19"    "MDM4"     
## [46] "LUC7L3"    "PDE3B"     "FN1"       "NAPG"      "IFNAR2"   
## [51] "ILF3-AS1"  "POLI"      "PDCD4"     "MGEA5"     "ZFP1"     
## [56] "NSD3"      "PNISR"     "TRA2A"     "SERPINI2"  "STN1"     
## [61] "TIGD7"     "PAN3"      "ESM1"      "GGNBP2"    "XPOT"     
## [66] "NEMF"      "AADACL2"   "HIST1H2BE" "LYAR"      "ACAT2"    
## [71] "NAP1L2"    "NKTR"      "RFC1"      "TTC17"
# find which ones are shared
intersect(neg_eqtl_snp_gns, neg_eqtl_de_gns)
## character(0)

35 SNPs in 34 unique genes affecting the expression of 74 unique genes. None of those genes are shared (i.e. all the SNPs are acting cis.

2.10.8 Compare which genes are differentially expressed eQTL genes between the two groups

setdiff(pos_eqtl_de_gns, neg_eqtl_de_gns)
##  [1] "RBM46"   "DLGAP5"  "TTK"     "HMMR"    "FST"     "KIF15"   "BUB1"   
##  [8] "ASPM"    "PBK"     "MCM10"   "KIF14"   "POLQ"    "MND1"    "CDC6"   
## [15] "CEP55"   "CLSPN"   "DTL"     "CCNB2"   "CCNA2"   "PCLAF"   "CCNB1"  
## [22] "KIF11"   "CENPF"   "MCM4"    "MKI67"   "EXO1"    "GLYATL2" "NUF2"   
## [29] "ANLN"    "KIF23"   "CA1"     "STIL"    "CDCA2"   "APOD"    "PLK1"   
## [36] "TRIP13"  "GTF3C3"  "GFM2"    "BIRC5"   "CDC45"   "NCAPH"   "AHSP"   
## [43] "NCAPG"   "RRM2"    "E2F8"    "OR1E2"   "ART1"
length(setdiff(pos_eqtl_de_gns, neg_eqtl_de_gns))
## [1] 47
setdiff(neg_eqtl_de_gns, pos_eqtl_de_gns)
##  [1] "CTAGE3P"   "EFHC1"     "UTS2"      "L1TD1"     "TBRG1"    
##  [6] "ATP5L"     "CHRDL2"    "FAM20A"    "SLTM"      "TUG1"     
## [11] "SNRPA1"    "ERP27"     "IL31RA"    "HCLS1"     "RBM25"    
## [16] "PRRG1"     "PPP6R3"    "RUNDC3B"   "ENTPD4"    "ZFC3H1"   
## [21] "DPP10"     "CDK12"     "MED4"      "MYSM1"     "HOXA1"    
## [26] "R3HDM2"    "SECISBP2"  "EZH1"      "HINT1"     "NEK1"     
## [31] "OR1C1"     "ZMYM2"     "RB1CC1"    "RSRC2"     "KIAA1468" 
## [36] "GSN"       "ZBTB1"     "SLC26A5"   "TBC1D15"   "SRSF5"    
## [41] "RPS15A"    "CNTRL"     "DUSP19"    "MDM4"      "LUC7L3"   
## [46] "PDE3B"     "FN1"       "NAPG"      "IFNAR2"    "ILF3-AS1" 
## [51] "POLI"      "PDCD4"     "MGEA5"     "ZFP1"      "NSD3"     
## [56] "PNISR"     "TRA2A"     "SERPINI2"  "STN1"      "TIGD7"    
## [61] "PAN3"      "ESM1"      "GGNBP2"    "XPOT"      "NEMF"     
## [66] "AADACL2"   "HIST1H2BE" "LYAR"      "ACAT2"     "NAP1L2"   
## [71] "NKTR"      "RFC1"      "TTC17"
length(setdiff(neg_eqtl_de_gns, pos_eqtl_de_gns))
## [1] 73
intersect(pos_eqtl_de_gns, neg_eqtl_de_gns)
## [1] "ADGRG7"

So all differentially expressed genes in each group are unique to that group other than ADGRG7.


2.11 Summary tables

SNP annotation
AIBL SNPs Proxy SNPs
Total genotyped 2084
No. in LD 1,527 18,118
Loci identified 2075 17,847
Genes 621 1,103
Unique genes (shared genes) 127 (494) 609 (494)
PLINK association analysis
No. significant SNPs (p < 0.05) Adjusted for multiple testing No. genes mapped to the significant SNPs
Association (chi square, Fisher exact) 72 0 (from 10,000 permutations) 58
Logistic regression with covariates 83 0 (< 0.05 FDR-adjusted) 59
Intersection between above analyses 43 N/A 47
eQTL analysis
No. significant SNPs (p < 0.05 FDR-adjusted) No. differentially expressed genes No. of SNPs acting in cis
Total cohort 130 SNPs mapped to 101 genes 2246 8
AB-pos only 28 SNPs mapped to 31 genes 48 0
AB-neg only 35 SNPs mapped to 34 genes 74 0

All differentially expressed genes in the AB-pos and -neg groups were unique to that group other than ADGRG7.


3 Enriched network decomposition (END) analysis

3.1 END using just the differentially expressed genes in networks

Load data

require(limma)
load("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_expression.RData")

Get END eigenvalues from ‘cluster_genexp’

require(limma)
load("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_expression.RData")
ll <- lapply(cluster_genexp, function (x) {
  xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
  svd.decomp2 <- svd(xm) # applies singular value decomposition
  svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
  X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
  xout <-  X.pca[,1] # takes the first eigemvalue for each individual
  return(xout)
})

Now each of the elements in the ‘ll’ list is a vector of 218 eigenvalues (one for each individual). Each element corresponds to one of the 244 networks.

Turn it into a data frame

lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
lld2 <- t(lld) # transpose so that individuals are rows
rownames(lld2) <- cluster_genexp[[1]][,1]
colnames(lld2) <- names(cluster_genexp)

Add metadata and gene/cluster annotation info

genemap <- read.csv("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_hsa_map.csv") # note that 5 networks are missing from 'cluster_genexp' - why?
keymeta <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T)
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data

# number of unique networks
length(unique(genemap$hsa))
## [1] 249
# nubmer of unique root probe IDs
length(unique(genemap$root_gene_probe_id))
## [1] 104
lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data

Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast

Visualise eigenvalues to see if they separate the groups

require(tidyverse)
lld4.1 <- data.frame(lld4)
lld4.1$AIBL.Id <- as.integer(rownames(lld4.1))
vis <- left_join(lld4.1, keymeta1, by = "AIBL.Id")
vis <- gather(vis, key = "network", value = "eigenvalue", 1:244)
ggplot(vis, aes(network, eigenvalue, colour = PET)) +
  geom_jitter() # most networks seeem to have a pretty even distribution between POS and NEG groups

Run limma

# Transpose data into matrix for limma fit
x1 <- t(lld4)
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics

## comparison treated - untreated
topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=244, genelist=colnames(lld4))
topALL.1 <- topALL1[,c("ID","P.Value", "adj.P.Val")]

Many of the genes in the cluster_genexp data set weren’t differentially expressed in the inital limma. Let’s filter these and then run the END on that reduced set.

Create vector of unique significant genes in ‘cluster_genexp’ and annotate with p-values from original limma

# create vector of all probe IDs that sit within clusters
clst_gns <- lapply(cluster_genexp, function (x) {
  names(x)[-1]
})

# take only unique probe IDs and remove "X" prefix
clst_gns_1 <- unique(as.character(unlist(clst_gns)))
clst_gns_1 <- gsub("X", "", clst_gns_1)
length(clst_gns_1)
## [1] 3434
# create data frame of probe IDs that sit within clusters, with their associated outputs from the original limma
lim1_out <- results_PETposvneg[results_PETposvneg$probe_id %in% clst_gns_1, ]
nrow(lim1_out)
## [1] 3434

So 3434 unique probe IDs sit within the clusters.

Get p-values from original limma for all root genes

root_lim <- results_PETposvneg[results_PETposvneg$probe_id %in% genemap$root_gene_probe_id, ] # take the expression data for root genes
root_lim <- dplyr::select(root_lim, root_probe_id = "probe_id", root_de_p.value = "P.Value", root_de_p.value_adj = "adj.P.Val") # select and rename columns of interest

# convert root_gene_probe_id to character for merging
genemap$root_gene_probe_id <- as.character(genemap$root_gene_probe_id)

# create consolidated table with initial limma results ("root_de_p.value"), cluster info, and END cluster limma results ("end_cluster_p.value")
root_lim <- dplyr::left_join(root_lim, 
                             dplyr::select(genemap, 
                                           root_probe_id = "root_gene_probe_id", 
                                           root_gene_symbol, 
                                           cluster_id = "hsa", 
                                           cluster_name = "name"),
                             by = "root_probe_id") %>% 
  left_join(dplyr::select(topALL1,
                          cluster_id = "ID",
                          end_cluster_p.value = "P.Value",
                          end_cluster_p.value_adj = "adj.P.Val"),
            by = "cluster_id")
## Warning: Column `cluster_id` joining factor and character vector, coercing
## into character vector
# for how many networks does the END limma have a lower p-value than the limma on the root gene?
length(which(root_lim$end_cluster_p.value < root_lim$root_de_p.value))
## [1] 24

So for 24 networks, the p-value from the limma on the END was lower than the p-value for the differential expression of the network’s root gene alone.

lim1_out_sig <- dplyr::filter(lim1_out, P.Value < 0.05) # filter for p-values < 0.05
sig_lim <- unlist(lim1_out_sig$probe_id) # create vector of significant probes
length(sig_lim)
## [1] 168
# remove the "X" from before all the probe names in the 'cluster_genexp' list
clst_tidy <- lapply(cluster_genexp, function (x) {
  a <- c(names(x)[1], gsub("X", "", names(x)[-1]))
  names(x) <- a
  return(x)
})

# shows how many genes in each network were found to be differentially expressed in the initial limma
clst_tidy_1 <- lapply(clst_tidy, function(x) {
  length(which(names(x) %in% sig_lim) == T)
})

# take all clusters with more than one gene that was found to be differentially expressed in the initial limma
clst_tidy_2 <- lapply(clst_tidy, function(x) {
  if (length(which(names(x) %in% sig_lim) == T) >= 1){
    a <- which(names(x) %in% sig_lim)
    x[, c(1, a)]
  }
})

# remove NULL entries
clst_tidy_3 <- clst_tidy_2[-which(sapply(clst_tidy_2, is.null))]

# how many clusters?
length(clst_tidy_3)
## [1] 199
# how many unique genes in those clusters?
clst_de_gns <- lapply(clst_tidy_3, function (x) {
  names(x)[-1]
})
clst_de_gns_1 <- unique(as.character(unlist(clst_de_gns))) # take only unique probe IDs and remove "X" prefix
length(clst_de_gns_1)
## [1] 168

Before running END on this new filtered set, we’ll extract a few of the clusters with the most differentially expressed genes in them to see if the PET-pos and -neg groups separate with PCA.

tail(sort(sapply(clst_tidy_2, length)), 6) # get names of six clusters with the most DE genes
## hsa04151 hsa04510 hsa05225 hsa05165 hsa05205 hsa05200 
##       10       10       10       12       12       25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 6), "names") # create vector of cluster names

# create list of just those six
clst_test <- clst_tidy_3[t]

# run loop over list to create plots for each
cluster_plots <- lapply(names(clst_test), function(namex) {
  df <- data.frame(clst_test[[namex]], stringsAsFactors = F)
  colnames(df)[1] <- "X"
  df1 <- left_join(keymeta1, df, by = c("AIBL.Id" = "X"))
  
  pca1 <- prcomp(df1[6:ncol(df1)], center = T, scale. = T)
  library(ggfortify)
  plot <- autoplot(pca1, data = df1, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle(namex)
  print(plot)
})

There’s some separation, but not a lot. What if we combine them? That is, we take all the differentially expressed genes that also fell within networks.

# get expression data for the 168 DE genes that fell within the networks
test2 <- cbind(exdata3[, 1:5], exdata3[, sig_lim])
pca2 <- prcomp(test2[6:ncol(test2)], center = T, scale. = T)
autoplot(pca2, data = test2, colour = "PET", label = T, label.label = "AIBL.Id") +
  viridis::scale_colour_viridis(option = "viridis", discrete = T)

Seems to be a bit of a dead end.

ll2 <- lapply(clst_tidy_3, function (x) {
  xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
  svd.decomp2 <- svd(xm) # applies singular value decomposition
  svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
  X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
  xout <-  X.pca[,1] # takes the first eigemvalue for each individual
  return(xout)
})

Turn it into a data frame

ll2d <- do.call(rbind.data.frame,ll2) # creates a data frame of eigenvalues
ll2d2 <- t(ll2d) # transpose so that individuals are rows
rownames(ll2d2) <- clst_tidy_3[[1]][,1]
colnames(ll2d2) <- names(clst_tidy_3)
ll2d3 <- ll2d2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data

Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
ll2d4 <- ll2d3[kp,] # remove individuals that do not have PET status recorded
ll2d4 <- ll2d4[-c(grep(c("771|914|918"), rownames(ll2d4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast

Visualise eigenvalues to see if they separate the groups

require(tidyverse)
ll2d4.1 <- data.frame(ll2d4)
ll2d4.1$AIBL.Id <- as.integer(rownames(ll2d4.1))
vis2 <- left_join(ll2d4.1, keymeta1, by = "AIBL.Id")
vis2 <- gather(vis2, key = "network", value = "eigenvalue", 1:(ncol(vis2) - 5))
ggplot(vis2, aes(network, eigenvalue, colour = PET)) +
  geom_jitter() # most networks seeem to have a pretty even distribution between POS and NEG groups

Run limma

# Transpose data into matrix for limma fit
x1 <- t(ll2d4)
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics

## comparison treated - untreated
topALL2<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=244, genelist=colnames(ll2d4))
topALL.2 <- topALL2[,c("ID","P.Value", "adj.P.Val")]

# number of significantly differentiated networks 
length(which(topALL.2$P.Value < 0.05))
## [1] 129
# number of significantly differentiated networks (FDR-adjusted)
length(which(topALL.2$adj.P.Val < 0.05))
## [1] 88

Join results to ‘root_lim’ table

root_lim <- dplyr::left_join(root_lim,
                      dplyr::select(topALL.2,
                                    cluster_id = "ID",
                                    end_cluster_de_p.value = "P.Value",
                                    end_cluster_de_p.value_adj = "adj.P.Val"),
                      by = "cluster_id")

3.1.1 Carry out PCA on networks that are significantly differentiated at the FDR adjusted level

# create data frame with attached metadata
fdr_sig <- topALL.2$ID[topALL.2$adj.P.Val < 0.05]# create vector of significant network IDs 
ll2d4.2 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.2$AIBL.Id <- as.integer(rownames(ll2d4.2))
ll2d4.2 <- left_join(ll2d4.2, keymeta1, by = "AIBL.Id")
ll2d4.2 <- dplyr::select(ll2d4.2, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca3 <- prcomp(ll2d4.2[6:ncol(ll2d4.2)], center = T, scale. = T)

autoplot(pca3, data = ll2d4.2, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA with significant clusters (FDR-adjusted)")

How about if we just take the top 10 clusters?

# create data frame with attached metadata
fdr_sig_10 <- topALL.2$ID[1:10]# create vector of significant network IDs 
ll2d4.3 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_10] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.3$AIBL.Id <- as.integer(rownames(ll2d4.3))
ll2d4.3 <- left_join(ll2d4.3, keymeta1, by = "AIBL.Id")
ll2d4.3 <- dplyr::select(ll2d4.3, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca3 <- prcomp(ll2d4.3[6:ncol(ll2d4.3)], center = T, scale. = T)

autoplot(pca3, data = ll2d4.3, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA with top 10 significant clusters (FDR-adjusted)")

How about if we just take the top 5 clusters?

# create data frame with attached metadata
fdr_sig_5 <- topALL.2$ID[1:5]# create vector of significant network IDs 
ll2d4.4 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_5] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.4$AIBL.Id <- as.integer(rownames(ll2d4.4))
ll2d4.4 <- left_join(ll2d4.4, keymeta1, by = "AIBL.Id")
ll2d4.4 <- dplyr::select(ll2d4.4, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca4 <- prcomp(ll2d4.4[6:ncol(ll2d4.4)], center = T, scale. = T)

autoplot(pca4, data = ll2d4.4, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA with top 5 significant clusters (FDR-adjusted)")

How about if we just take the top 2 clusters?

# create data frame with attached metadata
fdr_sig_2 <- topALL.2$ID[1:2]# create vector of significant network IDs 
ll2d4.5 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_2] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.5$AIBL.Id <- as.integer(rownames(ll2d4.5))
ll2d4.5 <- left_join(ll2d4.5, keymeta1, by = "AIBL.Id")
ll2d4.5 <- dplyr::select(ll2d4.5, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca5 <- prcomp(ll2d4.5[6:ncol(ll2d4.5)], center = T, scale. = T)

autoplot(pca5, data = ll2d4.5, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA with top 2 significant clusters (FDR-adjusted)")

How about with all clusters at the nominal p-value?

# create data frame with attached metadata
fdr_sig_all <- topALL.2$ID[topALL.2$P.Value < 0.05]# create vector of significant network IDs 
ll2d4.6 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_all] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.6$AIBL.Id <- as.integer(rownames(ll2d4.6))
ll2d4.6 <- left_join(ll2d4.6, keymeta1, by = "AIBL.Id")
ll2d4.6 <- dplyr::select(ll2d4.6, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca6 <- prcomp(ll2d4.6[6:ncol(ll2d4.6)], center = T, scale. = T)

autoplot(pca6, data = ll2d4.6, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA with all significant clusters (nominal p-value)")

3.2 Enrich for genes significant eQTL genes

Find which of the significant DE genes from the initial limma were also significant from the eQTL analysis of the PET-pos group

which(lim1_out_sig$probe_id %in% pos_sig_eqtl_gns$gene)
## integer(0)

None!

How about the eQTL analysis from the group as a whole?

which(lim1_out_sig$probe_id %in% sig_eqtl_gns$gene)
##  [1]  29  53  70  80  82  85  91 104 105 112 130 131 135 141 145 166
# get probe IDs for the DE eQTL genes
eqtl_sig_probes <- lim1_out_sig$probe_id[lim1_out_sig$probe_id %in% sig_eqtl_gns$gene]

# create list of cluster IDs that contain genes that were DE in the eQTL analysis
test <- lapply(clst_tidy, function(x){
  if (length(which(names(x) %in% eqtl_sig_probes) >= 1)){
    return(x)
  }
})
test <- test[-which(sapply(test, is.null))] # remove NULL entries

# create vector of cluster IDs
eqtl_cluster_ids <- names(test)

ll2d4.7 <- ll2d4.1[, colnames(ll2d4.1) %in% eqtl_cluster_ids]  # restrict to just those networks that hold genes that were DE in the eQTL analysis
ll2d4.7$AIBL.Id <- as.integer(rownames(ll2d4.7))
ll2d4.7 <- left_join(ll2d4.7, keymeta1, by = "AIBL.Id")
ll2d4.7 <- dplyr::select(ll2d4.7, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

# carry out PCA
pca7 <- prcomp(ll2d4.7[6:ncol(ll2d4.7)], center = T, scale. = T)

autoplot(pca7, data = ll2d4.7, colour = "PET", label = T, label.label = "AIBL.Id") +
    viridis::scale_colour_viridis(option = "viridis", discrete = T) +
    ggtitle("PCA differentially expressed eQTL genes")

3.3 Get filtered gene list

load(url("https://github.com/IanBrettell/AD_project/raw/master/Working/networks/dingo_filtered_probes.RData"))
load(url("https://github.com/IanBrettell/AD_project/raw/master/Working/networks/filtered_clusters.RData"))

# how many unique probes are there in the filtered iDINGO output?
length(dingo_filtered_probes)
## [1] 3434

9 February 2018

3.4 Summary tables

Counts of genes and networks comparing PET-pos v PET-neg groups
No. unique genes (probes) sig. at p < 0.05 (annotated) No. genes (probes) sig. FDR-adjusted level (annotated) No. networks
DE in limma (1) 865 (533) 0 244
Sig. diff. correlation from iDINGO 3434 (3434) ? 249
No. sig. genes from (1) in networks 168 (168) NA 199

For Monday:

  • Increase the threshold for the number of DE genes in each cluster from 1 to say, 5, create eigenvalues, for those, and then see how much better the PCA gets. Or just take the top 10 clusters.

  • Find which differentially expressed eQTL genes are within the FDR-adjusted significant networks of the limma run on the above.

  • Create boxplots for those genes by genotype, APOE-e4, and PET status.


13 February 2018

3.5 PCA for clusters with most DE genes in them

3.5.1 Create function to create eigenvalues and run limma

end_eigen <- function(cluster_list){
  # create eigenvalues
  ll <- lapply(cluster_list, function (x) {
    xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
    svd.decomp2 <- svd(xm) # applies singular value decomposition
    svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
    X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
    xout <-  X.pca[,1] # takes the first eigemvalue for each individual
    return(xout)
  })
  
  # turn it into a data frame
  lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
  lld2 <- t(lld) # transpose so that individuals are rows
  rownames(lld2) <- cluster_list[[1]][,1]
  colnames(lld2) <- names(cluster_list)

  # filter data and create contrasts
  require(RCurl)
  keymeta <- read.delim(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/key_metadata.txt"), header = T) # read key metadata
  lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
  kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
  keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
  keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
  Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
  lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
  lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
  treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
  design_treat1<-model.matrix(~0+treat1) # create model matrix
  colnames(design_treat1) <- c("Normal","Alzheimers")
  contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast

  # run limma
  x1 <- t(lld4) # Transpose data into matrix for limma fit
  fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
  fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
  fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
  topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=length(cluster_list), genelist=colnames(lld4))## comparison treated - untreated
}

3.5.2 Create function to create eigenvalues, run limma and plot

# create function for PCA analysis

end_pca_plot <- function(cluster_list, title){
  
# create eigenvalues
  ll <- lapply(cluster_list, function (x) {
    xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
    svd.decomp2 <- svd(xm) # applies singular value decomposition
    svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
    X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
    xout <-  X.pca[,1] # takes the first eigemvalue for each individual
    return(xout)
  })
  
  # turn it into a data frame
  lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
  lld2 <- t(lld) # transpose so that individuals are rows
  rownames(lld2) <- cluster_list[[1]][,1]
  colnames(lld2) <- names(cluster_list)

  # filter data and create contrasts
  keymeta <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T) # read key metadata
  lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
  kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
  keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
  keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
  Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
  lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
  lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
  treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
  design_treat1<-model.matrix(~0+treat1) # create model matrix
  colnames(design_treat1) <- c("Normal","Alzheimers")
  contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast

  # run limma
  x1 <- t(lld4) # Transpose data into matrix for limma fit
  fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
  fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
  fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
  topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=length(cluster_list), genelist=colnames(lld4))## comparison treated - untreated
  
  # create data frame with attached metadata
  if (length(topALL1$ID[topALL1$adj.P.Val < 0.05]) > 0){
    fdr_sig <- topALL1$ID[topALL1$adj.P.Val < 0.05]# create vector of significant network IDs
  } else {
    fdr_sig <- topALL1$ID[topALL1$P.Value < 0.05]
  }
  lld4.1 <- data.frame(lld4)
  lld4.2 <- lld4.1[, colnames(lld4.1) %in% fdr_sig] # restrict to just those networks that are significant at the FDR-adjusted level
  lld4.2$AIBL.Id <- as.integer(rownames(lld4.2))
  require(dplyr)
  lld4.2 <- left_join(lld4.2, keymeta1, by = "AIBL.Id")
  lld4.2 <- dplyr::select(lld4.2, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())

  # run PCA
  pca <- prcomp(lld4.2[6:ncol(lld4.2)], center = T, scale. = T)

  # plot
  require(ggplot2)
  require(ggfortify)
  plot <- autoplot(pca, data = lld4.2, colour = "PET", label = T, label.label = "AIBL.Id") +
      viridis::scale_colour_viridis(option = "viridis", discrete = T) +
      ggtitle(title)
  return(plot)
}

3.5.3 Top 10 clusters with most DE genes

tail(sort(sapply(clst_tidy_2, length)), 10) # get names of six clusters with the most DE genes
## hsa05169 hsa04360 hsa04740 hsa05206 hsa04151 hsa04510 hsa05225 hsa05165 
##        8        9        9        9       10       10       10       12 
## hsa05205 hsa05200 
##       12       25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 10), "names") # create vector of cluster names

# create list of just those six
clst_top_10 <- clst_tidy_3[t]

Run function on ‘clst_top_10’

end_pca_plot(clst_top_10, "PCA using 10 clusters with the most DE genes")

3.5.4 Top 5 clusters with most DE genes

tail(sort(sapply(clst_tidy_2, length)), 5) # get names of six clusters with the most DE genes
## hsa04510 hsa05225 hsa05165 hsa05205 hsa05200 
##       10       10       12       12       25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 5), "names") # create vector of cluster names

# create list of just those six
clst_top_5 <- clst_tidy_3[t]

Run function on ‘clst_top_5’

end_pca_plot(clst_top_5, "PCA using 5 clusters with the most DE genes")

3.6 Analyse filtered clusters from iDINGO

Meeting with JD:

Take average of absolute z-scores. Take 5th percentile from either end. Get mean, median, SD -This will let us know which clusters separate the two groups the best Dingo -Take the average of the bootstrap replications (the difference). The better that does, the smaller the error around the bootstrap average.

Load files

# load files from dropbox
load("~/Dropbox/eQTL/Results/cluster_data/iDINGO_results/RData/All_clusters.RData")
# create list
dingo_clstrs <- mget(ls(pattern = "hsa"))
# remove indidivual cluster lists
rm(list = ls(pattern = "hsa"))

Get the mean of each gene pair’s bootstrap

dingo_clstrs_2 <- lapply(dingo_clstrs, function(x){
  apply(x[["bootdiff"]], 1, mean)
})

Find the indexes for the gene pairs that sit in the extreme tails (<2.5%, >97.5%)

dingo_clstrs_3 <- lapply(dingo_clstrs_2, function(x){
  which(x < quantile(x, 0.025) | x > quantile(x, 0.975))
})

Get gene pairs corresponding to indexes

# create function to extract gene pairs
ind_match <- function(x, y){
  list(x[["genepairs"]][y, ])
}

# run over both lists
dingo_clstrs_4 <- mapply(ind_match, dingo_clstrs, dingo_clstrs_3)

Get unique genes in each network

dingo_clstrs_5 <- lapply(dingo_clstrs_4, function(x){
  a <- unique(x[["gene1"]])
  b <- unique(x[["gene2"]])
  union(a, b)
})

Pull out expression data for those genes

# find which clusters in 'dingo_clstrs_5 aren't in 'cluster_genexp' and remove
dingo_clstrs_6 <- dingo_clstrs_5[which(names(dingo_clstrs_5) %in% names(cluster_genexp))]

# test
# names(cluster_genexp[[2]])[names(cluster_genexp[[2]]) %in% dingo_clstrs_5[[2]]]

# create function to find which genes in 'cluster_genexp' that are in 'dingo_clstrs_6' 
gene_match <- function(x, y){
  cbind(x[1], x[names(x) %in% y])
}

# extract expression data for genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
dingo_clstrs_7 <- mapply(gene_match, cluster_genexp, dingo_clstrs_6)
end_pca_plot(dingo_clstrs_7, "PCA using END genes (5% threshold)")

Not great clustering. Try higher threshold?

3.6.1 PCA with END at 1 percentile threshold

Create function to END dingo analysis

end_filter <- function(list, lower, upper){
  
  # get list of means for the bootstrapped difference
  dingo_clstrs_2 <- lapply(list, function(x){
    apply(x[["bootdiff"]], 1, mean)
  })
  
  # Find the indexes for the gene pairs that sit in the extreme tails
  dingo_clstrs_3 <- lapply(dingo_clstrs_2, function(x){
    which(x < quantile(x, upper) | x > quantile(x, lower))
  })
  
  # get gene pairs corresponding to indexes
  ind_match <- function(x, y){ # create function to extract gene pairs
    list(x[["genepairs"]][y, ])
  }
  
  dingo_clstrs_4 <- mapply(ind_match, dingo_clstrs, dingo_clstrs_3) # run over both lists
  
  # get unique genes in each network
  dingo_clstrs_5 <- lapply(dingo_clstrs_4, function(x){
    a <- unique(x[["gene1"]])
    b <- unique(x[["gene2"]])
    union(a, b)
  })
  
  # pull out expression data for those genes
  dingo_clstrs_6 <- dingo_clstrs_5[which(names(dingo_clstrs_5) %in% names(cluster_genexp))] # find which clusters in 'dingo_clstrs_5 aren't in 'cluster_genexp' and remove
  
  gene_match <- function(x, y){ # create function to find which genes in 'cluster_genexp' that are in 'dingo_clstrs_6' 
    cbind(x[1], x[names(x) %in% y])
  }
  
  dingo_clstrs_7 <- mapply(gene_match, cluster_genexp, dingo_clstrs_6) # extract expression data for genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
  
  # remove clusters with 0 genes
  if (any(sapply(dingo_clstrs_7, function(x) length(x) < 2))){
    dingo_clstrs_8 <- lapply(dingo_clstrs_7, function(x) { # create NULL entries
    if (length(x) < 2){
      x <- NULL
    } else {
      x
    }
    })
    dingo_clstrs_8 <- dingo_clstrs_8[-which(sapply(dingo_clstrs_8, is.null))]
    dingo_clstrs_8
  } else {
    dingo_clstrs_7
  }
}

Run with 1% on either side

one_percentile <- end_filter(dingo_clstrs, 0.01, 0.99)
end_pca_plot(one_percentile, "PCA using END genes (1% threshold)")

3.6.2 PCA with END at 0.5 percentile threshold

half_percentile <- end_filter(dingo_clstrs, 0.005, 0.995)
end_pca_plot(half_percentile, "PCA using END genes (0.5% threshold)")

3.6.3 PCA with END at 0.25 percentile threshold

quarter_percentile <- end_filter(dingo_clstrs, 0.0025, 0.9975)
end_pca_plot(quarter_percentile, "PCA using END genes (0.25% threshold)")

3.6.4 PCA with END at 0.1 percentile threshold

point1_percentile <- end_filter(dingo_clstrs, 0.001, 0.999)
end_pca_plot(point1_percentile, "PCA using END genes (0.1% threshold)")


19 February 2018

Meeting with JD and MV

*Create a table of p-values: - one column for the intial limma results for the root gene of the network - one column for the limma results from the END for the networks containing just the genes that were significantly DE in the initial limma - one column for the limma results from the END for the networks containing just the genes that were filtered using the iDINGO results

20 February 2018

4 Comparison between approaches

4.1 Get table of results from initial limma

# load table of original limma results
sig_PET_pos_v_neg <- read.table(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/sig_PET_pos_v_neg.txt"),
                                header = T)

#select and rename relevant columns
init_lim_table <- sig_PET_pos_v_neg[!is.na(sig_PET_pos_v_neg$gene_symbol), ] %>%
  dplyr::select(probe_id, gene_symbol, logFC_init_limma = "logFC", p_value_init_limma = "P.Value", FDR_init_limma = "adj.P.Val")

# convert to character to later merging
init_lim_table$probe_id <- as.character(init_lim_table$probe_id)

4.2 Get table of results from limma from END for networks containing just DE genes

4.2.1 What we did

# read table with gene-cluster information
gns_in_clstrs <- read.table("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_PC1.csv", sep = ",", header = T)
gns_in_clstrs <- dplyr::rename(gns_in_clstrs, probe_id = "X")

# get names of all the clusters that each gene falls within
hsas <- apply(gns_in_clstrs, 1, function(x) {
  colnames(gns_in_clstrs[which(!is.na(x))][-1])
})
names(hsas) <- gns_in_clstrs$probe_id

# turn 'hsas' list into a data frame column
hsas_df <- tibble::enframe(hsas)

# bind with initial limma results
p_table_1 <- left_join(init_lim_table, 
                       dplyr::select(hsas_df, name, networks = "value"), 
                       by = c("probe_id" = "name"))

How many of the initial DE genes fell within netorks?

length(which(sapply(p_table_1$networks, is.null) == F))
## [1] 168

So only 0 of the DE probes from the original limma fell within a network.

Bind with ‘genemap’

genemap$root_gene_probe_id <- as.character(genemap$root_gene_probe_id)
p_table_2 <- dplyr::left_join(p_table_1, 
                              dplyr::select(genemap, root_gene_probe_id, rooted_network = "hsa", name),
                              by = c("probe_id" = "root_gene_probe_id"))

Get END results for clusters containing DE genes only

# get table of p-values
end_clsts_de_gns_only <- end_eigen(clst_tidy_3)

# join to table
p_table_3 <- dplyr::left_join(p_table_2,
                              dplyr::select(end_clsts_de_gns_only, ID, logFC_END_DE_only = "logFC", p_value_END_DE_only = "P.Value", FDR_END_DE_only = "adj.P.Val"),
                              by = c("rooted_network" = "ID"))

4.3 Get table of results for END-filtered networks

For 0.5% threshold

end_results_0.5 <- end_eigen(half_percentile)
# join to table
p_table_4 <- dplyr::left_join(p_table_3,
                              dplyr::select(end_results_0.5, ID, logFC_END_0.5 = "logFC", p_value_END_0.5 = "P.Value", FDR_END_0.5 = "adj.P.Val"),
                              by = c("rooted_network" = "ID"))

For 0.25% threshold

end_results_0.25 <- end_eigen(quarter_percentile)
# join to table
p_table_5 <- dplyr::left_join(p_table_4,
                              dplyr::select(end_results_0.25, ID, logFC_END_0.25 = "logFC", p_value_END_0.25 = "P.Value", FDR_END_0.25 = "adj.P.Val"),
                              by = c("rooted_network" = "ID"))

For 0.1% threshold

end_results_0.1 <- end_eigen(point1_percentile)
# join to table
p_table_6 <- dplyr::left_join(p_table_5,
                              dplyr::select(end_results_0.1, ID, logFC_END_0.1 = "logFC", p_value_END_0.1 = "P.Value", FDR_END_0.1 = "adj.P.Val"),
                              by = c("rooted_network" = "ID"))

For 1% threshold, and filter data for plotting

p_table_7 <- p_table_6[!is.na(p_table_6$rooted_network), ]

# add results for END 1 percentile
end_results_1 <- end_eigen(one_percentile)

p_table_7 <- dplyr::left_join(p_table_7,
                              dplyr::select(end_results_1, ID, logFC_END_1 = "logFC", p_value_END_1 = "P.Value", FDR_END_1 = "adj.P.Val"),
                              by = c("rooted_network" = "ID"))

4.3.1 Create function for volcano plots

vol_plot <- function(limma_output, xvar = "logFC_column", yvar = "p_value_column", alpha = 0.05, title = "Volcano plot", FDR = c(T, F), label_limit = alpha, label_type = c("gene", "network"), xlim_start = -0.15, xlim_end = 0.2){
  # load required packages
  require(tidyverse)
  require(ggrepel)
  
  # create data frame for manipulation
  data <- limma_output
  data$xcol <- data[, xvar]
  data$ycol <- data[, yvar]
  data$sig <- ifelse(data$ycol < alpha, "sig", "not sig") # create column for sign/non-sig
  data$ytrans <- -log10(data$ycol) # log transform p-values
  
  if (FDR == T){
    lab <- expression(-log[10]("FDR"))
  } 
  if (FDR == F){
    lab <- expression(-log[10]("p-value"))
  }
  
  # run ggplot
  p <-  ggplot(data = data,
          aes(x = xcol, y = ytrans),
          environment = environment()) +
  geom_point(aes(col = sig)) +
  scale_colour_manual(values = c("sig" = "#F8766D", "not sig" = "#00BFC4")) +
  geom_hline(yintercept = -log10(alpha)) +
  xlim(xlim_start, xlim_end) +
  ylim(0, 4) +
  xlab(expression(log[2]("fold change"))) +
  ylab(lab) +
  ggtitle(paste(title, " (alpha = ", alpha, ")", sep = ""))

  # create labels based on either gene or network
  ifelse(label_type == "gene",
         return(p + geom_text_repel(data = dplyr::filter(data, ycol < label_limit),
                                    aes(label = gene_symbol))),
         ifelse(label_type == "network",
                return(p + geom_text_repel(data = dplyr::filter(data, ycol < label_limit),
                                           aes(label = name))),
                return(p)))
  
}

4.3.2 Volcano plots comparing END results at different thresholds

plot_END_1 <- vol_plot(p_table_7, xvar = "logFC_END_1", yvar = "FDR_END_1", title = "END with genes in 1 percentile", FDR = T)
## Loading required package: ggrepel
plot_END_0.5 <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T)

plot_END_0.25 <- vol_plot(p_table_7, xvar = "logFC_END_0.25", yvar = "FDR_END_0.25", title = "END with genes in 0.25 percentile", FDR = T)

plot_END_0.1 <- vol_plot(p_table_7, xvar = "logFC_END_0.1", yvar = "FDR_END_0.1", title = "END with genes in 0.1 percentile", FDR = T)

plot_END_1
## Warning: Removed 4 rows containing missing values (geom_point).

plot_END_0.5
## Warning: Removed 4 rows containing missing values (geom_point).

plot_END_0.25
## Warning: Removed 4 rows containing missing values (geom_point).

plot_END_0.1
## Warning: Removed 4 rows containing missing values (geom_point).

4.3.3 Volcano plots comparing nominal p-values

vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "p_value_init_limma", title = "Initial limma: PET-pos v -neg", FDR = F, label_limit = 0.001)
## Warning: Removed 3 rows containing missing values (geom_point).

vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "p_value_END_0.5", title = "END with genes in 0.5 percentile", FDR = F, label_limit = 0.001)
## Warning: Removed 4 rows containing missing values (geom_point).

vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "p_value_END_DE_only", title = "END with DE genes only", FDR = F, label_limit = 0.001, label_type = "gene")
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_text_repel).

4.3.4 Volcano plots comparing FDR p-values

a <- vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "FDR_init_limma", title = "Initial limma: PET-pos v -neg", FDR = T)
b <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T)
c <- vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "FDR_END_DE_only", title = "END with DE genes only", FDR = T, label_limit = 0.005, label_type = "gene")

# redo with network names instead

d <- vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "FDR_init_limma", title = "Initial limma: PET-pos v -neg", FDR = T)
e <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T, label_type = "network")
f <- vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "FDR_END_DE_only", title = "END with DE genes only", FDR = T, label_limit = 0.005, label_type = "network")

a
## Warning: Removed 3 rows containing missing values (geom_point).

b
## Warning: Removed 4 rows containing missing values (geom_point).

c
## Warning: Removed 49 rows containing missing values (geom_point).

d
## Warning: Removed 3 rows containing missing values (geom_point).

e
## Warning: Removed 4 rows containing missing values (geom_point).

f
## Warning: Removed 49 rows containing missing values (geom_point).